=== Assistant: Below is a self-contained LaTeX document that proves a bound on the sum of iterate supports over the transient pre-identification phase ===

Below is a self-contained LaTeX document that proves a bound on the sum of iterate supports over the transient (pre-identification) phase, using a margin/strict-complementarity parameter for the ℓ1-regularized PageRank objective (3) from your open-problem note [open-problem-fountoulakis22a].

<syntaxhighlight lang="latex">
\documentclass[11pt]{article}
\usepackage{amsmath,amssymb,amsthm,mathtools}
\usepackage[margin=1in]{geometry}
\usepackage{hyperref}

% --------------------
% Theorem environments
% --------------------
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}
\newtheorem{assumption}{Assumption}
\theoremstyle{remark}
\newtheorem{remark}{Remark}

% --------------------
% Macros
% --------------------
\newcommand{\R}{\mathbb{R}}
\newcommand{\norm}[1]{\left\lVert #1 \right\rVert}
\newcommand{\ip}[2]{\left\langle #1,#2 \right\rangle}
\newcommand{\prox}{\operatorname{prox}}
\newcommand{\supp}{\operatorname{supp}}
\newcommand{\diag}{\operatorname{diag}}

\title{Bounding the Sum of Supports in the Transient of Accelerated Proximal Gradient via a Margin Parameter}
\author{}
\date{}

\begin{document}
\maketitle

\begin{abstract}
We study accelerated proximal-gradient methods for composite objectives of the form $F(x)=f(x)+\sum_{i=1}^n \lambda_i |x_i|$ where $f$ is smooth and strongly convex. Under a strict-complementarity \emph{margin} condition at the minimizer $x^\star$, we bound: (i) the number of \emph{spurious} active coordinates (outside $\supp(x^\star)$) at each iteration in terms of $\|y_k-x^\star\|_2$, and (ii) the \emph{sum of supports} $\sum_{k=0}^{K-1}|\supp(x_{k+1})|$ over the \emph{transient} phase before inactive coordinates are permanently identified as zero. We also specialize constants to the $\ell_1$-regularized PageRank objective (COLT'22 open problem).
\end{abstract}

\section{Composite model, support sets, and a margin parameter}

Let
\begin{equation}\label{eq:prob}
\min_{x\in\R^n} F(x) := f(x) + g(x),
\qquad
g(x) := \sum_{i=1}^n \lambda_i |x_i|, \quad \lambda_i>0.
\end{equation}
Assume $f$ is differentiable, $\nabla f$ is $L$-Lipschitz in $\ell_2$, and $F$ is $\mu$-strongly convex ($\mu>0$). Then $F$ has a unique minimizer $x^\star$. Define the optimal support and its complement
\[
S^\star := \supp(x^\star),\qquad I^\star := \{1,\dots,n\}\setminus S^\star.
\]

\subsection{Weighted soft-thresholding proximal map}

\begin{lemma}[Prox of weighted $\ell_1$]\label{lem:prox}
For any $t>0$ and $u\in\R^n$,
\[
\prox_{t g}(u)_i = \operatorname{sign}(u_i)\,\max\{|u_i|-t\lambda_i,\,0\}.
\]
In particular, $\prox_{t g}(u)_i\neq 0$ if and only if $|u_i|>t\lambda_i$.
\end{lemma}

\begin{proof}
The proximal subproblem is separable over coordinates: $\min_x \frac{1}{2t}\|x-u\|_2^2 + \sum_i \lambda_i|x_i|$, giving weighted soft-thresholding.
\end{proof}

\subsection{Margin / strict complementarity}

First-order optimality gives $0\in\nabla f(x^\star)+\partial g(x^\star)$. In particular, for any $i\in I^\star$ (so $x_i^\star=0$), one has $|\nabla f(x^\star)_i|\le \lambda_i$.

\begin{assumption}[Margin parameter]\label{ass:margin}
Define
\begin{equation}\label{eq:gamma}
\gamma := \min_{i\in I^\star}\bigl(\lambda_i - |\nabla f(x^\star)_i|\bigr).
\end{equation}
Assume $\gamma>0$. Equivalently, every inactive coordinate satisfies $|\nabla f(x^\star)_i|\le \lambda_i-\gamma$.
\end{assumption}

\begin{remark}
Assumption~\ref{ass:margin} is a standard strict-complementarity condition for $\ell_1$ regularization. Without $\gamma>0$, finite-time identification and sharp sparsity control can fail in the worst case.
\end{remark}
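% Added illustration: a toy instance with hypothetical numbers, included
% only to make Lemma 1 and the margin concrete; it is not used later.
\begin{remark}[Toy example of the prox and the margin]
With hypothetical values $t=1$, $\lambda=(1,2)$, and $u=(1.5,-0.4)$,
Lemma~\ref{lem:prox} gives
$\prox_{tg}(u)_1=\operatorname{sign}(u_1)\max\{1.5-1,\,0\}=0.5$ since
$|u_1|=1.5>t\lambda_1=1$, and $\prox_{tg}(u)_2=0$ since
$|u_2|=0.4\le t\lambda_2=2$: only coordinates whose magnitude strictly
exceeds the weighted threshold survive. Similarly, if an inactive
coordinate of a minimizer has $|\nabla f(x^\star)_i|=0.7$ with
$\lambda_i=1$, it contributes $\lambda_i-|\nabla f(x^\star)_i|=0.3$ to the
minimum defining $\gamma$ in \eqref{eq:gamma}.
\end{remark}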
\section{Accelerated proximal gradient}

Fix a step size $t\in(0,1/L]$ and a momentum parameter $\beta\in[0,1)$. Consider the inertial/accelerated proximal-gradient iteration
\begin{equation}\label{eq:apg}
\begin{aligned}
y_k &= x_k + \beta (x_k-x_{k-1}),\\
x_{k+1} &= \prox_{t g}\!\bigl(y_k - t\nabla f(y_k)\bigr),
\end{aligned}
\qquad k\ge 0,
\end{equation}
initialized with
\begin{equation}\label{eq:init}
x_{-1}=x_0=0.
\end{equation}
(Thus $y_0=x_0=0$.) Define the \emph{spurious active set at step $k$}
\begin{equation}\label{eq:Ak}
A_k := \supp(x_{k+1})\cap I^\star = \{i\in I^\star:\ x_{k+1,i}\neq 0\}.
\end{equation}

\section{Key counting lemma: spurious activations cost distance to $x^\star$}

Define the forward (gradient) map input
\[
u(y) := y - t\nabla f(y).
\]

\begin{lemma}[Each spurious active coordinate forces a $t\gamma$-jump]\label{lem:tgammastep}
Assume Assumption~\ref{ass:margin}. Fix any $y\in\R^n$ and set $x^+(y)=\prox_{t g}(u(y))$. Let $A(y):=\supp(x^+(y))\cap I^\star$. Then for every $i\in A(y)$,
\[
|u(y)_i-u(x^\star)_i| > t\gamma,
\]
and consequently
\begin{equation}\label{eq:counting}
|A(y)| \le \frac{\|u(y)-u(x^\star)\|_2^2}{t^2\gamma^2}.
\end{equation}
\end{lemma}

\begin{proof}
Fix $i\in I^\star$. Then $x_i^\star=0$ and $u(x^\star)_i=-t\nabla f(x^\star)_i$, so $|u(x^\star)_i|=t|\nabla f(x^\star)_i|\le t(\lambda_i-\gamma)$. If $i\in A(y)$ then $x^+(y)_i\neq 0$, hence by Lemma~\ref{lem:prox} we have $|u(y)_i|>t\lambda_i$. Therefore
\[
|u(y)_i-u(x^\star)_i|\ge |u(y)_i|-|u(x^\star)_i|>t\lambda_i-t(\lambda_i-\gamma)=t\gamma.
\]
Summing over $i\in A(y)$ gives $\|u(y)-u(x^\star)\|_2^2>\sum_{i\in A(y)}(t\gamma)^2=|A(y)|t^2\gamma^2$, which implies \eqref{eq:counting}.
\end{proof}

\begin{lemma}[Lipschitz transfer to $y$-distance]\label{lem:uLip}
If $\nabla f$ is $L$-Lipschitz, then for all $y$,
\[
\|u(y)-u(x^\star)\|_2 \le (1+tL)\|y-x^\star\|_2.
\]
Hence,
\begin{equation}\label{eq:Ak_vs_y}
|A(y)| \le \left(\frac{1+tL}{t\gamma}\right)^2 \|y-x^\star\|_2^2.
\end{equation}
\end{lemma}

\begin{proof}
$u(y)-u(x^\star)=(y-x^\star)-t(\nabla f(y)-\nabla f(x^\star))$. Take norms and use $\|\nabla f(y)-\nabla f(x^\star)\|_2\le L\|y-x^\star\|_2$. Combine with Lemma~\ref{lem:tgammastep}.
\end{proof}

Applying Lemma~\ref{lem:uLip} to $y=y_k$ gives the per-iteration spurious-support bound
\begin{equation}\label{eq:Ak_bound_general}
|A_k| \le \left(\frac{1+tL}{t\gamma}\right)^2 \|y_k-x^\star\|_2^2.
\end{equation}
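% Added illustration: instantiating the per-iteration bound at the
% canonical step size; this follows directly from \eqref{eq:Ak_bound_general}.
\begin{remark}[The bound at $t=1/L$]
For instance, with the canonical step $t=1/L$ the prefactor in
\eqref{eq:Ak_bound_general} becomes $\bigl((1+tL)/(t\gamma)\bigr)^2=(2L/\gamma)^2$, so
\[
|A_k| \le \frac{4L^2}{\gamma^2}\,\|y_k-x^\star\|_2^2:
\]
halving the margin $\gamma$ can quadruple the worst-case number of spurious
activations at a given distance from $x^\star$.
\end{remark}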
\section{Bounding $\|y_k-x^\star\|$ using a standard accelerated linear rate}

To obtain an explicit \emph{sum} bound, we use a standard accelerated linear convergence statement.

\begin{assumption}[Standard accelerated linear convergence]\label{ass:linear_rate}
There exist constants $q\in(0,1)$ and $\Delta_0>0$ such that
\begin{equation}\label{eq:linear_rate}
F(x_k)-F(x^\star) \le \Delta_0\, q^k\qquad \text{for all }k\ge 0.
\end{equation}
With initialization $x_0=0$, we have $\Delta_0=F(0)-F(x^\star)$.
\end{assumption}

\begin{remark}
For strongly convex composite objectives, a \emph{standard} accelerated proximal-gradient method with step $t=1/L$ and a constant momentum choice (e.g.\ Nesterov's $\beta=\frac{\sqrt{L}-\sqrt{\mu}}{\sqrt{L}+\sqrt{\mu}}$) achieves \eqref{eq:linear_rate} with $q\approx 1-\sqrt{\mu/L}$ (up to standard constant-factor variants). The identification/sparsity argument below only needs \eqref{eq:linear_rate} and strong convexity.
\end{remark}

\begin{lemma}[From function gap to $y_k$-distance]\label{lem:yk_distance}
Assume $F$ is $\mu$-strongly convex and Assumption~\ref{ass:linear_rate} holds. Then for all $k\ge 1$,
\begin{equation}\label{eq:yk_dist_bound}
\|y_k-x^\star\|_2^2
\le \frac{4\Delta_0}{\mu}\Bigl( (1+\beta)^2 q^k + \beta^2 q^{k-1}\Bigr)
\le \underbrace{\frac{4\Delta_0}{\mu}\Bigl((1+\beta)^2 q + \beta^2\Bigr)}_{=:M}\, q^{k-1}.
\end{equation}
Also, $\|y_0-x^\star\|_2^2=\|x_0-x^\star\|_2^2 \le \frac{2\Delta_0}{\mu}$.
\end{lemma}

\begin{proof}
Strong convexity gives $F(x)-F(x^\star)\ge \frac{\mu}{2}\|x-x^\star\|_2^2$, hence $\|x_k-x^\star\|_2^2 \le \frac{2}{\mu}(F(x_k)-F(x^\star)) \le \frac{2\Delta_0}{\mu}q^k$. For $k\ge 1$, write $y_k-x^\star=(1+\beta)(x_k-x^\star)-\beta(x_{k-1}-x^\star)$ and use $\|a-b\|_2^2 \le 2\|a\|_2^2 + 2\|b\|_2^2$ to get
\[
\|y_k-x^\star\|_2^2 \le 2(1+\beta)^2\|x_k-x^\star\|_2^2 + 2\beta^2\|x_{k-1}-x^\star\|_2^2.
\]
Substitute the bounds on $\|x_k-x^\star\|_2^2$ and $\|x_{k-1}-x^\star\|_2^2$. The second inequality in \eqref{eq:yk_dist_bound} uses $q^k\le q^{k-1}$ for $k\ge 1$. For $k=0$, $y_0=x_0$ and the strong convexity bound gives the claim.
\end{proof}

\section{Transient identification and the sum of supports}

The margin implies that once $y_k$ is sufficiently close to $x^\star$, \emph{no} inactive coordinate can be activated.

\begin{lemma}[Stability radius for inactivity]\label{lem:stability}
Assume Assumption~\ref{ass:margin}. Define
\begin{equation}\label{eq:r_def}
r := \frac{t\gamma}{1+tL}.
\end{equation}
If $\|y-x^\star\|_2\le r$, then $A(y)=\emptyset$ (i.e.\ $\supp(\prox_{tg}(u(y)))\subseteq S^\star$). In particular, if $\|y_k-x^\star\|_2\le r$ then $A_k=\emptyset$.
\end{lemma}

\begin{proof}
If $\|y-x^\star\|_2\le r$, then by Lemma~\ref{lem:uLip}, $\|u(y)-u(x^\star)\|_2\le (1+tL)\|y-x^\star\|_2\le t\gamma$, so every coordinate satisfies $|u(y)_i-u(x^\star)_i|\le t\gamma$. But by Lemma~\ref{lem:tgammastep}, any $i\in A(y)$ would have to satisfy the strict inequality $|u(y)_i-u(x^\star)_i|>t\gamma$; hence $A(y)=\emptyset$.
\end{proof}

Define the \emph{(margin-based) identification time}
\begin{equation}\label{eq:Kdef}
K_\gamma := \min\{k\ge 0:\ \|y_k-x^\star\|_2\le r\}.
\end{equation}
Then by Lemma~\ref{lem:stability}, $A_k=\emptyset$ for all $k\ge K_\gamma$. We call iterations $k=0,1,\dots,K_\gamma-1$ the \emph{transient phase}.

\begin{lemma}[Explicit bound on $K_\gamma$]\label{lem:Kbound}
Assume Assumption~\ref{ass:linear_rate} and let $M$ be as in Lemma~\ref{lem:yk_distance}. Then
\[
K_\gamma \le 1 + \left\lceil \frac{\log(M/r^2)}{\log(1/q)} \right\rceil.
\]
\end{lemma}

\begin{proof}
For $k\ge 1$, Lemma~\ref{lem:yk_distance} gives $\|y_k-x^\star\|_2^2\le M q^{k-1}$. It suffices that $M q^{k-1}\le r^2$, i.e.\ $k-1\ge \log(M/r^2)/\log(1/q)$.
\end{proof}
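% Added illustration: a numeric instance of Lemma~\ref{lem:Kbound} with
% hypothetical constants, showing the scale of the identification time.
\begin{remark}[Numeric example of Lemma~\ref{lem:Kbound}]
With hypothetical values $q=0.9$, $M=10$, and $r=10^{-2}$ we get
$\log(M/r^2)=\log(10^{5})\approx 11.51$ and $\log(1/q)\approx 0.105$, so
$K_\gamma \le 1+\lceil 11.51/0.105\rceil = 111$. The dependence on $r$ (and
hence on the margin $\gamma$) is only logarithmic.
\end{remark}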
\subsection{Main theorem: a bound on $\sum_{k=0}^{K_\gamma-1}|\supp(x_{k+1})|$}

\begin{theorem}[Sum of supports over the transient]\label{thm:sum_support_transient}
Assume Assumptions~\ref{ass:margin} and \ref{ass:linear_rate}, and run \eqref{eq:apg}--\eqref{eq:init}. Let $K_\gamma$ be defined by \eqref{eq:Kdef}. Then
\begin{equation}\label{eq:sum_support_main}
\sum_{k=0}^{K_\gamma-1} |\supp(x_{k+1})|
\;\le\;
|S^\star|\,K_\gamma
\;+\;
\left(\frac{1+tL}{t\gamma}\right)^2
\left(
\frac{2\Delta_0}{\mu}
+\frac{4\Delta_0}{\mu}\cdot \frac{(1+\beta)^2 q + \beta^2}{1-q}
\right).
\end{equation}
Moreover, $K_\gamma$ satisfies Lemma~\ref{lem:Kbound}. In particular, if one uses the standard choices $t=1/L$ and $q\approx 1-\sqrt{\mu/L}$, then the \emph{spurious part} of the sum in \eqref{eq:sum_support_main} is
\[
O\!\left(\frac{\Delta_0\,L^{5/2}}{\gamma^2\,\mu^{3/2}}\right),
\]
and $K_\gamma = O\!\left(\sqrt{\tfrac{L}{\mu}}\log \tfrac{1}{\gamma}\right)$ (up to constants hidden in $M$).
\end{theorem}

\begin{proof}
For each $k$, $\supp(x_{k+1}) = (\supp(x_{k+1})\cap S^\star)\cup A_k$, hence $|\supp(x_{k+1})|\le |S^\star| + |A_k|$. Summing over $k=0,\dots,K_\gamma-1$ yields
\[
\sum_{k=0}^{K_\gamma-1} |\supp(x_{k+1})| \le |S^\star|K_\gamma + \sum_{k=0}^{K_\gamma-1}|A_k|
\le |S^\star|K_\gamma + \sum_{k=0}^{\infty}|A_k|.
\]
Now use \eqref{eq:Ak_bound_general} with Lemma~\ref{lem:yk_distance}:
\[
|A_0|\le \left(\frac{1+tL}{t\gamma}\right)^2 \|y_0-x^\star\|_2^2 \le \left(\frac{1+tL}{t\gamma}\right)^2 \frac{2\Delta_0}{\mu},
\]
and for $k\ge 1$,
\[
|A_k| \le \left(\frac{1+tL}{t\gamma}\right)^2 \|y_k-x^\star\|_2^2
\le \left(\frac{1+tL}{t\gamma}\right)^2 \frac{4\Delta_0}{\mu}\Bigl( (1+\beta)^2 q^k + \beta^2 q^{k-1}\Bigr).
\]
Sum the geometric series: $\sum_{k=1}^\infty q^k = \frac{q}{1-q}$ and $\sum_{k=1}^\infty q^{k-1}=\frac{1}{1-q}$. This gives \eqref{eq:sum_support_main}. The asymptotic statements follow by inserting $t=1/L$ so $(1+tL)/(t\gamma)=2L/\gamma$ and $1/(1-q)\asymp \sqrt{L/\mu}$ when $q\approx 1-\sqrt{\mu/L}$.
\end{proof}

\begin{remark}[Interpretation]
Theorem~\ref{thm:sum_support_transient} bounds the \emph{total amount of sparsity work} accumulated before strict complementarity (margin) forces all inactive coordinates to stay zero. The bound splits cleanly into: (i) a ``true-support'' term $|S^\star|K_\gamma$, and (ii) a purely margin-controlled ``spurious'' term scaling like $\gamma^{-2}$ times a condition-number factor.
\end{remark}

\section{Specialization: $\ell_1$-regularized PageRank}

We record the standard constants for the $\ell_1$-regularized PageRank objective (COLT'22 open problem):
\[
F(x)= \frac12 x^\top Qx - \alpha x^\top D^{-1/2}s \;+\; \rho\alpha \|D^{1/2}x\|_1,
\]
with $Q=\alpha I + \frac{1-\alpha}{2}L_{\rm sym}$ and $L_{\rm sym}$ the normalized Laplacian. For undirected graphs, $\lambda(L_{\rm sym})\subseteq[0,2]$, hence $\lambda(Q)\subseteq[\alpha,1]$. Therefore $\nabla f$ is $L$-Lipschitz with $L\le 1$, and $F$ is $\mu$-strongly convex with $\mu=\alpha$. Also $\lambda_i = \rho\alpha\sqrt{d_i}$ in \eqref{eq:prob}.

\begin{corollary}[PageRank constants; explicit $\Delta_0$ bound from $x_0=0$]\label{cor:ppr}
Assume the seed is a single node $v$ so $s=e_v$. With initialization $x_0=0$ we have $F(0)=0$ and
\[
\Delta_0 = F(0)-F(x^\star) = -F(x^\star) \le \frac{\alpha}{2d_v}\le \frac{\alpha}{2}.
\]
If we use the standard step $t=1$ (admissible since $L\le 1$), then $r=\gamma/(1+L)\ge \gamma/2$ and $(1+tL)/(t\gamma)=(1+L)/\gamma\le 2/\gamma$. Consequently, the spurious part of \eqref{eq:sum_support_main} is
\[
O\!\left(\frac{1}{\gamma^2\sqrt{\alpha}}\right)
\quad\text{(up to standard constant factors in the accelerated rate and the momentum choice).}
\]
Moreover, $K_\gamma = O\!\left(\frac{1}{\sqrt{\alpha}}\log\frac{1}{\gamma}\right)$.
\end{corollary}

\begin{proof}
Since $g(x)\ge 0$, we have $F(x)\ge f(x)$ and thus $F(x^\star)\ge \min_x f(x)$. For $f(x)=\frac12 x^\top Qx - b^\top x$ with $b=\alpha D^{-1/2}s$, the unconstrained minimizer is $x_f^\star=Q^{-1}b$ and $\min f = -\frac12 b^\top Q^{-1}b$. Because $Q\succeq \alpha I$, we have $Q^{-1}\preceq \frac{1}{\alpha}I$ and hence $b^\top Q^{-1}b \le \frac{1}{\alpha}\|b\|_2^2 = \frac{1}{\alpha}\alpha^2\|D^{-1/2}s\|_2^2 =\alpha\|D^{-1/2}s\|_2^2$. For $s=e_v$, $\|D^{-1/2}s\|_2^2 = 1/d_v$, giving $\min f \ge -\alpha/(2d_v)$, hence $-F(x^\star)\le -\min f \le \alpha/(2d_v)$. The remaining statements follow by inserting $L\le 1$, $\mu=\alpha$, $t=1$ into Theorem~\ref{thm:sum_support_transient} and Lemma~\ref{lem:Kbound}.
\end{proof}
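% Added illustration: plugging hypothetical PageRank numbers into
% Corollary~\ref{cor:ppr}; not part of the original argument.
\begin{remark}[Numeric sanity check of Corollary~\ref{cor:ppr}]
With hypothetical values $\alpha=0.04$ and seed degree $d_v=2$ we get
$\Delta_0/\mu \le 1/(2d_v)=1/4$ independently of $\alpha$, while
$1/(1-q)\asymp 1/\sqrt{\alpha}=5$. The spurious part of
\eqref{eq:sum_support_main} is then a constant multiple of
$(2/\gamma)^2\cdot\tfrac14\cdot 5 = 5/\gamma^2$, consistent with the
advertised $O\bigl(1/(\gamma^2\sqrt{\alpha})\bigr)$ rate.
\end{remark}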
\begin{remark}[What this bound does \emph{not} give]
The bound is explicit and dimension-free, but it depends on the margin $\gamma$. To recover a purely $(\alpha,\rho)$-based locality bound (e.g.\ $\widetilde O((\sqrt{\alpha}\rho)^{-1})$) one would need an additional argument lower-bounding $\gamma$ in terms of $\rho$ for the instance class of interest.
\end{remark}

\end{document}
</syntaxhighlight>
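To complement the proof, here is a minimal runnable sketch (an added illustration, not part of the note or the document above) of the accelerated proximal-gradient iteration with weighted soft-thresholding on a toy ℓ1-regularized PageRank instance, tracking the sum of iterate supports. The path graph, the parameter values (alpha, rho, the momentum choice), and all helper names are hypothetical choices made for this demo.

<syntaxhighlight lang="python">
# Illustrative sketch (hypothetical instance): run APG on the objective
# F(x) = 0.5 x'Qx - alpha x'D^{-1/2}s + rho*alpha*||D^{1/2}x||_1
# with Q = alpha*I + (1-alpha)/2 * L_sym, on a small path graph, and
# record sum_k |supp(x_{k+1})|.
import numpy as np

def make_instance(n=20, alpha=0.25, rho=1e-3, seed_node=0):
    """Build Q = alpha*I + (1-alpha)/2 * L_sym, b = alpha*D^{-1/2}s,
    and per-coordinate weights lambda_i = rho*alpha*sqrt(d_i)."""
    A = np.zeros((n, n))
    for i in range(n - 1):                  # path graph adjacency
        A[i, i + 1] = A[i + 1, i] = 1.0
    d = A.sum(axis=1)
    Dm12 = np.diag(1.0 / np.sqrt(d))
    L_sym = np.eye(n) - Dm12 @ A @ Dm12     # normalized Laplacian
    Q = alpha * np.eye(n) + 0.5 * (1.0 - alpha) * L_sym
    s = np.zeros(n)
    s[seed_node] = 1.0
    b = alpha * (s / np.sqrt(d))            # alpha * D^{-1/2} s
    lam = rho * alpha * np.sqrt(d)          # weighted l1 penalties
    return Q, b, lam

def soft_threshold(u, thresh):
    """Weighted soft-thresholding: prox of x -> sum_i thresh_i*|x_i| (Lemma 1)."""
    return np.sign(u) * np.maximum(np.abs(u) - thresh, 0.0)

def apg_support_sum(Q, b, lam, t, beta, iters=300):
    """x_{-1} = x_0 = 0; y_k = x_k + beta*(x_k - x_{k-1});
    x_{k+1} = prox_{t g}(y_k - t*grad f(y_k)). Returns (x, sum of supports)."""
    x_prev = np.zeros(len(b))
    x = x_prev.copy()
    support_sum = 0
    for _ in range(iters):
        y = x + beta * (x - x_prev)
        grad = Q @ y - b                    # grad f for the quadratic part
        x_prev, x = x, soft_threshold(y - t * grad, t * lam)
        support_sum += int(np.count_nonzero(x))
    return x, support_sum

alpha = 0.25
Q, b, lam = make_instance(alpha=alpha)
# eig(Q) lies in [alpha, 1], so L <= 1 and t = 1 is admissible; use the
# constant Nesterov momentum for mu = alpha, L = 1.
beta = (1.0 - np.sqrt(alpha)) / (1.0 + np.sqrt(alpha))
x_hat, total = apg_support_sum(Q, b, lam, t=1.0, beta=beta)
print("final support size:", int(np.count_nonzero(x_hat)))
print("sum of supports over all iterations:", total)
</syntaxhighlight>

On such a toy instance one can vary rho (and hence the margin) and watch the transient sum of supports grow as the margin shrinks, in the direction the theorem predicts.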