\[\begin{split}\operatorname{Oracle}_{\mathcal{F},\text{reg}}\left(\{z_i\},\{u_i\}\right) &= \operatorname{argmin}_{f\in\mathcal{F}}\frac{1}{n}\sum^n_{i=1}\left(u_i-f(z_i)\right)^2 \\
\operatorname{Oracle}_{\mathcal{F},\text{class}}\left(\{x_i\},\{v_i\}, \{w_i\}\right) &= \operatorname{argmax}_{f\in\mathcal{F}}\frac{1}{n}\sum^n_{i=1} w_i \Pr_{Z_i\sim\operatorname{Ber}\left(\frac{1+f(x_i)}{2}\right)}\left(Z_i = v_i \right)\end{split}\]
be oracles for the regression and (weighted) classification problems. For data \(A = \{a_1,\ldots, a_n\}\), define \(\mathcal{F}_A = \left\{\left(f\left(a_1\right), \ldots, f\left(a_n\right)\right): f \in \mathcal{F}\right\}\).
Estimator 3
For the joint estimator with ridge regularization
\[\begin{split}(\hat{g},\hat{h}) = \arg \min_{g\in\mathcal{G}, h \in \mathcal{H}}
\max_{f' \in \mathcal{F}} \mathbb{E}_n \left[ 2 \left\{ g(A) - Y \right\} f'(C') - f'(C')^2 \right]
+ \mu' \mathbb{E}_n \left\{ g(A)^2 \right\} \\
+ \max_{f \in \mathcal{F}} \mathbb{E}_n \left[ 2 \left\{ h(B) - g(A) \right\} f(C) - f(C)^2 \right]
+ \mu \mathbb{E}_n \left\{ h(B)^2 \right\}\end{split}\]
Ensemble solution
Consider the algorithm where for \(t=1, \ldots, T\):
\[\begin{split}\begin{aligned}
& u_i'^{t}=\left(y_i-\frac{1}{t-1} \sum_{\tau=1}^{t-1} g_\tau\left(a_i\right)\right), \quad u_i^t=\frac{1}{t-1} \sum_{\tau=1}^{t-1} \bigg(g_\tau\left(a_i\right)-h_\tau\left(b_i\right)\bigg)\\
& f'_t=\operatorname{Oracle}_{\mathcal{F'}, \text{reg}}\left(\{c_i'\}, \{u_i'^t\}\right),\quad f_t=\operatorname{Oracle}_{\mathcal{F}, \text{reg}}\left(\{c_i\}, \{u_i^t\}\right) \\
& v_i'^t=\frac{1}{\mu't}\sum_{\tau=1}^{t}\bigg(f'_\tau(c'_i)-f_\tau(c_i)\bigg), \quad \qquad v_i^t=\frac{1}{\mu t}\sum_{\tau=1}^{t}f_\tau(c_i)\\
&g_t=\operatorname{Oracle}_{\mathcal{G}, \text{reg}}\left(\{a_i\}, \{v_i'^t\}\right), \qquad h_t=\operatorname{Oracle}_{\mathcal{H}, \text{reg}}\left(\{b_i\}, \{v_i^t\}\right) \\
&
\end{aligned}\end{split}\]
Suppose that the sets \(\mathcal{F'}_{C'}\), \(\mathcal{F}_{C}\), \(\mathcal{G}_{A}\), \(\mathcal{H}_{B}\) are all convex sets. Then the ensembles: \(\bar{g}=\frac{1}{T} \sum_{t=1}^T g_t\), \(\bar{h}=\frac{1}{T} \sum_{t=1}^T h_t\), are a \(O\left(\frac{\log (T)+1}{T}\right)\)-approximate solution to the minimax problem.
ensemble2.Ensemble2IVL2([adversary, ...])
|
An extension of Ensemble2IV with L2 regularization and optional cross-validation to select the best regularization parameter. |
Proof
We can write the minimax problem as a convex-concave zero-sum game:
\[\min_{b\in \mathcal{H}_B, a\in \mathcal{G}_A}\max_{c\in \mathcal{F}_C, c'\in \mathcal{F'}_{C'}}
\frac{1}{n}\sum_{i=1}^{n}2(Y_i-a_i)c'_i-c_i'^2+2(a_i-b_i)c_i-c_i^2+\mu'a_i^2+\mu b_i^2\]
\[= \max_{b\in \mathcal{H}_B, a\in \mathcal{G}_A}\min_{c\in \mathcal{F}_C, c'\in \mathcal{F'}_{C'}}
\frac{1}{n} \underbrace{\sum_{i=1}^{n}c_i'^2-2(Y_i-a_i)c'_i+c_i^2-2(a_i-b_i)c_i-\mu'a_i^2-\mu b_i^2}_{:=\ell(\{c,c'\},\{a,b\})}\]
where the loss \(\ell(\{c,c'\},\{a,b\})\) is convex in \(\{c,c'\}\) and concave in \(\{a,b\}\).
The adversary chooses vector \((c_i,c_i')\) based on follow-the-leader (FTL):
\[\{c_t,c_t'\} = \operatorname{argmin}_{c\in \mathcal{F}_C, c'\in \mathcal{F'}_{C'}}
\frac{1}{t-1}\sum_{\tau=1}^{t-1}\ell(\{c,c'\},\{a_\tau,b_\tau\})\]
by separating the minimization and completing the square, we have that
\[\begin{split}c_t &= \operatorname{argmin}_{c \in \mathcal{F}_C} \frac{1}{n} \sum_{i=1}^{n} \left( c_i - \frac{1}{t-1} \sum_{\tau=1}^{t-1} \left\{ a_{i\tau} - b_{i\tau} \right\} \right)^2 \\
&= \operatorname{argmin}_{c \in \mathcal{F}_C} \frac{1}{n} \sum_{i=1}^{n} \left( c_i - u_i^{t} \right)^2 \\
&= \operatorname{Oracle}_{\mathcal{F}, \text{reg}} \left( \{c_i\}, \{u_i^t\} \right)\end{split}\]
\[\begin{split}c'_t &= \operatorname{argmin}_{c' \in \mathcal{F'}_{C'}} \frac{1}{n} \sum_{i=1}^{n} \left( c_i' - \frac{1}{t-1} \sum_{\tau=1}^{t-1} \left\{ y_i - a_{i\tau} \right\} \right)^2 \\
&= \operatorname{argmin}_{c' \in \mathcal{F'}_{C'}} \frac{1}{n} \sum_{i=1}^{n} \left( c_i' - u_i'^{t} \right)^2 \\
&= \operatorname{Oracle}_{\mathcal{F'}, \text{reg}} \left( \{c_i'\}, \{u_i'^t\} \right)\end{split}\]
Now, the learner plays be-the-leader (BTL) which involves choosing \((a_t,b_t)\) that best responds
\[\{a_t,b_t\} = \operatorname{argmax}_{a\in \mathcal{G}_A, b\in \mathcal{H}_{B}}
\frac{1}{t}\sum_{\tau=1}^{t}\ell(\{c_\tau,c'_\tau\},\{a,b\})\]
which after separating the minimization problem and completing the square we get:
\[\begin{split}a_t &= \operatorname{argmin}_{a \in \mathcal{G}_A} \frac{1}{n} \sum_{i=1}^{n}
\left( a_i - \frac{1}{\mu' t} \sum_{\tau=1}^{t} \left\{ c'_{i\tau} - c_{i\tau} \right\} \right)^2 \\
&= \operatorname{argmin}_{a \in \mathcal{G}_A} \frac{1}{n} \sum_{i=1}^{n} \left( a_i - v_i'^{t} \right)^2 \\
&= \operatorname{Oracle}_{\mathcal{G}, \text{reg}} \left( \{a_i\}, \{v_i'^t\} \right)\end{split}\]
\[\begin{split}b_t &= \operatorname{argmin}_{b \in \mathcal{H}_{B}} \frac{1}{n} \sum_{i=1}^{n}
\left( b_i - \frac{1}{\mu t} \sum_{\tau=1}^{t} c_{i\tau} \right)^2 \\
&= \operatorname{argmin}_{b \in \mathcal{H}_{B}} \frac{1}{n} \sum_{i=1}^{n} \left( b_i - v_i^{t} \right)^2 \\
&= \operatorname{Oracle}_{\mathcal{H}, \text{reg}} \left( \{b_i\}, \{v_i^t\} \right)\end{split}\]
Thus it remains to show that the ensembles
\[\bar{a} = \frac{1}{T}\sum_{t=1}^{T}a_t\,,\qquad \bar{b} = \frac{1}{T}\sum_{t=1}^{T}b_t\]
are also a solution to the empirical minimax problem.
Observe that the learner has zero regret, since it is playing the BTL algorithm. Thus if we show that the FTL algorithm has \(\operatorname{Regret}(T)\)-regret after \(T\) periods, then \((\bar{a},\bar{b})\) is an \(\epsilon(T)\) approximate solution to the minimax problem, invoking the results of Freund and Schapire (1999).
Hence, we now focus on the online learning problem that the adversary is facing and show that FTL is a no-regret algorithm with regret rate \(\operatorname{Regret}(T)=O\left(\frac{\log (T)}{T}\right)\). We can upper bound the regret of the FTL algorithm by:
\[\operatorname{Regret}(T) \leq \frac{1}{T} \sum_{t=1}^T\bigg(\ell\left(\{c_t, c'_t\}, \{a_t, b_t\}\right)-\ell\left(\{c_{t+1}, c'_{t+1}\}, \{a_t, b_t\}\right)\bigg)\]
The loss \(\ell\left(\cdot, \{a,b\}\right)\) is \(\frac{2}{n}\)-strongly convex with respect to the \(\|\cdot\|_2\)-norm on \(C\times C'\). Moreover the loss is \(O\left(\frac{1}{\sqrt{n}}\right)\)-Lipschitz, since
\[\nabla_{\{c,c'\}}\ell\left(\{c,c'\}, \{a,b\}\right) = \frac{2}{n}\left(\{c,c'\} - \{y-a,a-b\}\right)\]
so
\[\begin{split}\|\nabla_{\{c,c'\}}\ell\left(\{c,c'\}, \{a,b\}\right)\|_2
&= \frac{2}{n}\sqrt{\sum_{i=1}^{n}\left(c_i-(y_i-a_i)+c_i'-(a_i-b_i)\right)^2} \\
&\leq \frac{2}{n}\left(\|c\|_2+\|y\|_2+\|a\|_2+\|c'\|_2+\|a\|_2+\|b\|_2\right) \\
&\leq O\left(\frac{1}{\sqrt{n}}\right)\end{split}\]
Then \(L_t = \sum_{\tau=1}^t \ell(\cdot, \{a_\tau, b_\tau\})\) is \(\frac{2t}{n}\)-strongly convex. Since \(\{c_{t+1}, c'_{t+1}\}\) is a minimizer of \(L_t\) and the set \(C\times C'\) is convex, we have by the strong convexity and the first order condition that
\[\begin{split}L_t(\{c_t,c'_t\}) &\geq L_t(\{c_{t+1},c'_{t+1}\}) + \left\langle \{c_{t},c'_{t}\}-\{c_{t+1},c'_{t+1}\}, \nabla_{\{c,c'\}}L_t(\{c_{t+1},c'_{t+1}\})\right\rangle + \frac{t}{n}\left\|\{c_{t},c'_{t}\},\{c_{t+1},c'_{t+1}\}\right\|_2^2 \\
&\geq L_t(\{c_{t+1},c'_{t+1}\})+ \frac{t}{n}\left\|\{c_{t},c'_{t}\},\{c_{t+1},c'_{t+1}\}\right\|_2^2\end{split}\]
and
\[L_{t-1}(\{c_{t+1},c'_{t+1}\}) \geq L_{t-1}(\{c_{t},c'_{t}\})+ \frac{t}{n}\left\|\{c_{t},c'_{t}\},\{c_{t+1},c'_{t+1}\}\right\|_2^2\]
Adding the two previous equations and re-arranging gives:
\[\ell(\{c_t, c'_t\}, \{a_t, b_t\})-\ell(\{c_{t+1}, c'_{t+1}\}, \{a_{t}, b_{t}\})\geq \frac{2t}{n}\left\|\{c_{t},c'_{t}\},\{c_{t+1},c'_{t+1}\}\right\|_2^2\]
Invoking the Lipschitzness of \(\ell_t\):
\[\frac{K}{\sqrt{n}}\left\|\{c_{t},c'_{t}\},\{c_{t+1},c'_{t+1}\}\right\|_2\geq \frac{2t}{n}\left\|\{c_{t},c'_{t}\},\{c_{t+1},c'_{t+1}\}\right\|_2^2\]
so that
\[\left\|\{c_{t},c'_{t}\},\{c_{t+1},c'_{t+1}\}\right\|_2\leq \frac{K}{2}\frac{\sqrt{n}}{t}\]
Finally,
\[\begin{split} \operatorname{Regret}(T) &\leq \frac{1}{T} \sum_{t=1}^T \left( \ell\left(\{c_t, c'_t\}, \{a_t, b_t\}\right)
- \ell\left(\{c_{t+1}, c'_{t+1}\}, \{a_t, b_t\}\right) \right) \\
& \leq \frac{1}{T} \sum_{t=1}^T \frac{K}{\sqrt{n}} \left\|\{c_{t}, c'_{t}\}, \{c_{t+1}, c'_{t+1}\}\right\|_2 \\
& \leq \frac{1}{T} \sum_{t=1}^T \frac{K^2}{2} \frac{1}{t} \\
& \leq K^2 \frac{\log T + 1}{T}\end{split}\]
Subsetted estimator
For the subsetted estimator
\[\begin{split}(\hat{g},\hat{h}) = \arg \min _{g\in\mathcal{G}, h \in \mathcal{H}}
\max_{f' \in \mathcal{F}} \mathbb{E}_p\left[2\left\{g(A)-Y\right\} f'(C')-f'(C')^2\right]
+ \mu'\mathbb{E}_n\{g(A)^2\} \\
+ \max_{f \in \mathcal{F}} \mathbb{E}_q\left[2\left\{h(B)-g(A)\right\} f(C)-f(C)^2\right]
+ \mu\mathbb{E}_n\{h(B)^2\}\end{split}\]
We simply modify the updates for \(v_t', v_t\) as
\[v_i'^t = \frac{1}{\mu't}\sum_{\tau=1}^{t}\bigg(f'_\tau(c'_i)1\big(i\in[p]\big)-f_\tau(c_i)1\big(i\in[q]\big)\bigg), \quad
v_i^t = \frac{1}{\mu t}\sum_{\tau=1}^{t}f_\tau(c_i)1\big(i\in[q]\big)\]
Estimator 3 - (Function class bounded)
For the joint estimator:
\[\begin{split}(\hat{g}, \hat{h}) = \arg \min _{g \in \mathcal{G}, h \in \mathcal{H}}
\max_{f' \in \mathcal{F}} \mathbb{E}_n \left[ 2 \left\{ g(A) - Y \right\} f'(C') - f'(C')^2 \right] \\
+ \max_{f \in \mathcal{F}} \mathbb{E}_n \left[ 2 \left\{ h(B) - g(A) \right\} f(C) - f(C)^2 \right]\end{split}\]
Ensemble solution
Consider the algorithm where for \(t=1, \ldots, T\):
\[\begin{split}\begin{aligned}
& u_i'^{t}=\left(y_i-\frac{1}{t-1} \sum_{\tau=1}^{t-1} g_\tau\left(a_i\right)\right), \quad u_i^t=\frac{1}{t-1} \sum_{\tau=1}^{t-1} \bigg(g_\tau\left(a_i\right)-h_\tau\left(b_i\right)\bigg)\\
& f'_t=\operatorname{Oracle}_{\mathcal{F'}, \text{reg}}\left(\{c_i'\}, \{u_i'^t\}\right),\quad f_t=\operatorname{Oracle}_{\mathcal{F}, \text{reg}}\left(\{c_i\}, \{u_i^t\}\right) \\
& v_i'^t=1\big(f'_t(c'_i)-f_t(c_i)>0\big), \qquad v_i^t=1\big(f_t(c_i)>0\big)\\
& w_i'^t=\big|f'_t(c'_i)-f_t(c_i)\big|, \qquad \qquad w_i^t=|f_t(c_i)|\\
&g_t=\operatorname{Oracle}_{\mathcal{G}, \text{class}}\left(\{a_i\}, \{v_i'^t\}, \{w_i'^t\}\right), \quad h_t=\operatorname{Oracle}_{\mathcal{H}, \text{class}}\left(\{b_i\}, \{v_i^t\}, \{w_i^t\}\right) \\
&
\end{aligned}\end{split}\]
Suppose that the sets \(\mathcal{F'}_{C'}\), \(\mathcal{F}_{C}\) are convex sets. Then the ensembles: \(\bar{g} = \frac{1}{T} \sum_{t=1}^T g_t\), \(\bar{h} = \frac{1}{T} \sum_{t=1}^T h_t\), are a \(O \left( \frac{\log (T) + 1}{T} \right)\)-approximate solution to the minimax problem.
ensemble2.Ensemble2IV([adversary, learnerg, ...])
|
Implements a nested ensemble learning IV method with two adversaries and two learners. |
Proof
The proof is analogous to Ensemble solution, except that the learner best-responds to the current test function:
\[\begin{split}\begin{aligned}
\{a_t, b_t\} &= \operatorname{argmax}_{a \in \mathcal{G}_A, b \in \mathcal{H}_{B}} \ell(\{c_t, c'_t\}, \{a, b\}) \\
&= \operatorname{argmax}_{a \in \mathcal{G}_A, b \in \mathcal{H}_{B}} \sum_{i=1}^{n} c_{it}'^2 - 2(Y_i - a_i) c'_{it} + c_{it}^2 - 2(a_i - b_i) c_{it}
\end{aligned}\end{split}\]
which gives:
\[\begin{split}\begin{aligned}
a_t &= \operatorname{argmax}_{a \in \mathcal{G}_{A}} \sum_{i=1}^{n} a_i (c'_{it} - c_{it}) \\
&= \operatorname{argmax}_{a \in \mathcal{G}_{A}} \frac{1}{n} \sum_i a_i \left| c'_{it} - c_{it} \right| \operatorname{sign} \left( c'_{it} - c_{it} \right) \\
&= \operatorname{argmax}_{a \in \mathcal{G}_{A}} \frac{1}{n} \sum_i \left| c'_{it} - c_{it} \right| \mathbb{E}_{z \sim \operatorname{Bernoulli} \left( \frac{a_i + 1}{2} \right)} \left[ \left( 2 z_i - 1 \right) \operatorname{sign} \left( c'_{it} - c_{it} \right) \right] \\
&= \operatorname{argmax}_{a \in \mathcal{G}_{A}} \frac{1}{n} \sum_i \left| c'_{it} - c_{it} \right| \left( \operatorname{Pr}_{z \sim \operatorname{Bernoulli} \left( \frac{a_i + 1}{2} \right)} \left[ \left( 2 z_i - 1 \right) = \operatorname{sign} \left( c'_{it} - c_{it} \right) \right] \right. \\
& \qquad \qquad \qquad \left. - \operatorname{Pr}_{z \sim \operatorname{Bernoulli} \left( \frac{a_i + 1}{2} \right)} \left[ \left( 2 z_i - 1 \right) \neq \operatorname{sign} \left( c'_{it} - c_{it} \right) \right] \right) \\
&= \operatorname{argmax}_{a \in \mathcal{G}_{A}} \frac{1}{n} \sum_i \left| c'_{it} - c_{it} \right| \left( 2 \operatorname{Pr}_{z \sim \operatorname{Bernoulli} \left( \frac{a_i + 1}{2} \right)} \left[ \left( 2 z_i - 1 \right) = \operatorname{sign} \left( c'_{it} - c_{it} \right) \right] - 1 \right) \\
&= \operatorname{argmax}_{a \in \mathcal{G}_{A}} \frac{1}{n} \sum_i \left| c'_{it} - c_{it} \right| \operatorname{Pr}_{z \sim \operatorname{Bernoulli} \left( \frac{a_i + 1}{2} \right)} \left[ \left( 2 z_i - 1 \right) = \operatorname{sign} \left( c'_{it} - c_{it} \right) \right] \\
&= \operatorname{argmax}_{a \in \mathcal{G}_{A}} \frac{1}{n} \sum_i \left| c'_{it} - c_{it} \right| \operatorname{Pr}_{z \sim \operatorname{Bernoulli} \left( \frac{a_i + 1}{2} \right)} \left[ z_i = \frac{\operatorname{sign} \left( c'_{it} - c_{it} \right) + 1}{2} \right] \\
&= \operatorname{argmax}_{a \in \mathcal{G}_{A}} \frac{1}{n} \sum_i \left| c'_{it} - c_{it} \right| \operatorname{Pr}_{z \sim \operatorname{Bernoulli} \left( \frac{a_i + 1}{2} \right)} \left[ z_i = 1 \left\{ c'_{it} - c_{it} > 0 \right\} \right] \\
&= \operatorname{argmax}_{a \in \mathcal{G}_{A}} \frac{1}{n} \sum_i w_{it} \operatorname{Pr}_{z \sim \operatorname{Bernoulli} \left( \frac{a_i + 1}{2} \right)} \left[ z_i = v_{it} \right] \\
&= \operatorname{Oracle}_{\mathcal{G}, \text{class}} \left( \{a_i\}, \{v_i'^t\}, \{w_i'^t\} \right)
\end{aligned}\end{split}\]
and
\[\begin{split}\begin{aligned}
b_t &= \operatorname{argmax}_{b \in \mathcal{H}_{B}} \sum_{i=1}^{n} b_i c_{it} \\
&= \operatorname{argmax}_{b \in \mathcal{H}_{B}} \frac{1}{n} \sum_i b_i \left| c_{it} \right| \operatorname{sign} \left( c_{it} \right) \\
&= \operatorname{argmax}_{b \in \mathcal{H}_{B}} \frac{1}{n} \sum_i \left| c_{it} \right| \mathbb{E}_{z \sim \operatorname{Bernoulli} \left( \frac{b_i + 1}{2} \right)} \left[ \left( 2 z_i - 1 \right) \operatorname{sign} \left( c_{it} \right) \right] \\
&= \operatorname{argmax}_{b \in \mathcal{H}_{B}} \frac{1}{n} \sum_i \left| c_{it} \right| \left( \operatorname{Pr}_{z \sim \operatorname{Bernoulli} \left( \frac{b_i + 1}{2} \right)} \left[ \left( 2 z_i - 1 \right) = \operatorname{sign} \left( c_{it} \right) \right] \right. \\
& \qquad \qquad \qquad \left. - \operatorname{Pr}_{z \sim \operatorname{Bernoulli} \left( \frac{b_i + 1}{2} \right)} \left[ \left( 2 z_i - 1 \right) \neq \operatorname{sign} \left( c_{it} \right) \right] \right) \\
&= \operatorname{argmax}_{b \in \mathcal{H}_{B}} \frac{1}{n} \sum_i \left| c_{it} \right| \left( 2 \operatorname{Pr}_{z \sim \operatorname{Bernoulli} \left( \frac{b_i + 1}{2} \right)} \left[ \left( 2 z_i - 1 \right) = \operatorname{sign} \left( c_{it} \right) \right] - 1 \right) \\
&= \operatorname{argmax}_{b \in \mathcal{H}_{B}} \frac{1}{n} \sum_i \left| c_{it} \right| \operatorname{Pr}_{z \sim \operatorname{Bernoulli} \left( \frac{b_i + 1}{2} \right)} \left[ \left( 2 z_i - 1 \right) = \operatorname{sign} \left( c_{it} \right) \right] \\
&= \operatorname{argmax}_{b \in \mathcal{H}_{B}} \frac{1}{n} \sum_i \left| c_{it} \right| \operatorname{Pr}_{z \sim \operatorname{Bernoulli} \left( \frac{b_i + 1}{2} \right)} \left[ z_i = \frac{\operatorname{sign} \left( c_{it} \right) + 1}{2} \right] \\
&= \operatorname{argmax}_{b \in \mathcal{H}_{B}} \frac{1}{n} \sum_i \left| c_{it} \right| \operatorname{Pr}_{z \sim \operatorname{Bernoulli} \left( \frac{b_i + 1}{2} \right)} \left[ z_i = 1 \left\{ c_{it} > 0 \right\} \right] \\
&= \operatorname{argmax}_{b \in \mathcal{H}_{B}} \frac{1}{n} \sum_i w_{it} \operatorname{Pr}_{z \sim \operatorname{Bernoulli} \left( \frac{b_i + 1}{2} \right)} \left[ z_i = v_{it} \right] \\
&= \operatorname{Oracle}_{\mathcal{H}, \text{class}} \left( \{b_i\}, \{v_i^t\}, \{w_i^t\} \right)
\end{aligned}\end{split}\]
Subsetted Estimator
For the subsetted estimator:
\[\begin{split}\begin{aligned}
(\hat{g}, \hat{h}) = \arg \min _{g \in \mathcal{G}, h \in \mathcal{H}}
\max_{f' \in \mathcal{F}} \mathbb{E}_p \left[ 2 \left\{ g(A) - Y \right\} f'(C') - f'(C')^2 \right] \\
+ \max_{f \in \mathcal{F}} \mathbb{E}_q \left[ 2 \left\{ h(B) - g(A) \right\} f(C) - f(C)^2 \right]
\end{aligned}\end{split}\]
We simply modify the updates for \((w_t', w_t)\) as:
\[\begin{split}\begin{aligned}
w_i'^t &= \left| f'_t(c'_i) 1\left( i \in [p] \right) - f_t(c_i) 1\left( i \in [q] \right) \right|, \\
w_i^t &= \left| f_t(c_i) 1\left( i \in [q] \right) \right|
\end{aligned}\end{split}\]