Editing Openai/6959ed27-accc-800e-8e87-21aa81e93c07 (section)

=== Assistant: cpp ===
<syntaxhighlight lang="cpp">// ellipse_bench_mine_vs_claude.cpp
// Compare:
//   (1) mine2      : curvature method, r/q single-sqrt, 2 iterations
//   (2) opt3       : curvature method, r/q single-sqrt, 3 iterations (0xfaded + r/q opt)
//   (3) claude_rot4: Newton + rotation trick, 4 iterations
//
// Build:
//   g++ -O3 -march=native -ffast-math -std=c++17 ellipse_bench_mine_vs_claude.cpp -o bench -lm
//
#include <cmath>
#include <cstdio>
#include <chrono>
#include <random>
#include <vector>
#include <algorithm>

// Clock type whose now() is sampled around each timed benchmark pass.
using Clock = std::chrono::high_resolution_clock;

// Plain 2-D point in single precision; kept as an aggregate so it can be
// brace-initialized as { x, y }.
struct Point {
    float x;
    float y;
};

// Clamps v to the closed interval [0, 1].
// Built from std::fmax/std::fmin, which return the non-NaN operand when
// exactly one operand is NaN, so under strict IEEE semantics a NaN input
// yields 0.
static inline float clamp01(float v) {
    const float floored = std::fmax(0.0f, v);   // lower bound first
    const float capped  = std::fmin(1.0f, floored);
    return capped;
}

// -----------------------------
// mine2: curvature, 2 iters
// -----------------------------
// Approximates the point on the axis-aligned ellipse x^2/a^2 + y^2/b^2 = 1
// nearest to (px, py) with the curvature-based iteration, using a fixed
// count of 2 iterations.
//
// The iteration runs on (|px|, |py|) in the first quadrant; the signs of
// px and py are re-applied to the result.
//
//   a, b   : semi-axes along x and y (both are used as divisors)
//   px, py : query point
// Returns { a*tx, b*ty } carrying the signs of px and py, where (tx, ty) is
// the unit vector left by the final iteration.
static inline Point mine2(float a, float b, float px, float py) {
    float px_abs = std::fabs(px), py_abs = std::fabs(py);
    // Start at the 45-degree parameter: tx = ty = 1/sqrt(2).
    float tx = 0.70710678f, ty = 0.70710678f;
    float a2 = a * a, b2 = b * b;
    // Coefficients of the ellipse's center of curvature (evolute):
    //   ex = (a^2 - b^2)/a * tx^3,  ey = (b^2 - a^2)/b * ty^3
    float ca = (a2 - b2) / a;
    float cb = (b2 - a2) / b;

    // 2 iterations
    for (int i = 0; i < 2; ++i) {
        float x = a * tx, y = b * ty;          // current point on the ellipse
        float tx3 = tx * tx * tx;
        float ty3 = ty * ty * ty;
        float ex = ca * tx3;                   // center of curvature at (x, y)
        float ey = cb * ty3;
        float rx = x - ex, ry = y - ey;        // center -> ellipse point
        float qx = px_abs - ex, qy = py_abs - ey;  // center -> query point

        // r/q combined into one sqrt (0xfaded optimization).
        // The 1e-30f term keeps the denominator non-zero when q is (0, 0).
        float rq = std::sqrt((rx * rx + ry * ry) / (qx * qx + qy * qy + 1e-30f));

        // Rescale q to length |r|, map back to parameter space, stay in the
        // first quadrant, then renormalize to a unit vector.
        float ntx = clamp01((qx * rq + ex) / a);
        float nty = clamp01((qy * rq + ey) / b);
        float invLen = 1.0f / std::sqrt(ntx * ntx + nty * nty);
        tx = ntx * invLen;
        ty = nty * invLen;
    }

    return { std::copysign(a * tx, px), std::copysign(b * ty, py) };
}

// -----------------------------
// opt3: curvature, 3 iters
// -----------------------------
// Approximates the point on the axis-aligned ellipse x^2/a^2 + y^2/b^2 = 1
// nearest to (px, py) with the curvature-based iteration, using a fixed
// count of 3 iterations.
//
// The iteration runs on (|px|, |py|) in the first quadrant; the signs of
// px and py are re-applied to the result.
//
//   a, b   : semi-axes along x and y (both are used as divisors)
//   px, py : query point
// Returns { a*tx, b*ty } carrying the signs of px and py, where (tx, ty) is
// the unit vector left by the final iteration.
static inline Point opt3(float a, float b, float px, float py) {
    float px_abs = std::fabs(px), py_abs = std::fabs(py);
    // Start at the 45-degree parameter: tx = ty = 1/sqrt(2).
    float tx = 0.70710678f, ty = 0.70710678f;
    float a2 = a * a, b2 = b * b;
    // Coefficients of the ellipse's center of curvature (evolute):
    //   ex = (a^2 - b^2)/a * tx^3,  ey = (b^2 - a^2)/b * ty^3
    float ca = (a2 - b2) / a;
    float cb = (b2 - a2) / b;

    // 3 iterations (matches SO implementation)
    for (int i = 0; i < 3; ++i) {
        float x = a * tx, y = b * ty;          // current point on the ellipse
        float tx3 = tx * tx * tx;
        float ty3 = ty * ty * ty;
        float ex = ca * tx3;                   // center of curvature at (x, y)
        float ey = cb * ty3;
        float rx = x - ex, ry = y - ey;        // center -> ellipse point
        float qx = px_abs - ex, qy = py_abs - ey;  // center -> query point

        // Single sqrt for |r| / |q|; the 1e-30f term keeps the denominator
        // non-zero when q is (0, 0).
        float rq = std::sqrt((rx * rx + ry * ry) / (qx * qx + qy * qy + 1e-30f));

        // Rescale q to length |r|, map back to parameter space, stay in the
        // first quadrant, then renormalize to a unit vector.
        float ntx = clamp01((qx * rq + ex) / a);
        float nty = clamp01((qy * rq + ey) / b);
        float invLen = 1.0f / std::sqrt(ntx * ntx + nty * nty);
        tx = ntx * invLen;
        ty = nty * invLen;
    }

    return { std::copysign(a * tx, px), std::copysign(b * ty, py) };
}

// -----------------------------
// claude_rot4: Newton rotation, 4 iters
// -----------------------------
// Approximates the point on the axis-aligned ellipse x^2/a^2 + y^2/b^2 = 1
// nearest to (px, py) using up to 4 Newton steps on the parameter angle t,
// tracked as (c, s) = (cos t, sin t) so no trig calls are needed.
//
// The iteration runs on (|px|, |py|); the signs of px and py are re-applied
// to the result.
//
//   a, b   : semi-axes along x and y (both are used as divisors)
//   px, py : query point
// Returns { a*c, b*s } carrying the signs of px and py.
static inline Point claude_rot4(float a, float b, float px, float py) {
    float px_abs = std::fabs(px), py_abs = std::fabs(py);
    float a2mb2 = a * a - b * b;

    // init: normalized direction in scaled space
    // (1e-10f keeps len non-zero when the query point is the origin)
    float nx = px_abs / a;
    float ny = py_abs / b;
    float len = std::sqrt(nx * nx + ny * ny + 1e-10f);
    float c = nx / len;
    float s = ny / len;

    for (int i = 0; i < 4; ++i) {
        // f(t) = (a^2 - b^2) sin t cos t - px a sin t + py b cos t vanishes
        // where the vector from the ellipse point to the query point is
        // perpendicular to the tangent; fp is df/dt.
        float f  = a2mb2 * s * c - px_abs * a * s + py_abs * b * c;
        float fp = a2mb2 * (c * c - s * s) - px_abs * a * c - py_abs * b * s;
        if (std::fabs(fp) < 1e-10f) break;     // avoid dividing by ~0
        float dt = f / fp;

        // small-angle rotation update + renorm:
        // first-order expansion of cos(t - dt), sin(t - dt)
        float nc = c + dt * s;
        float ns = s - dt * c;
        float inv = 1.0f / std::sqrt(nc * nc + ns * ns);
        c = nc * inv;
        s = ns * inv;
    }

    return { std::copysign(a * c, px), std::copysign(b * s, py) };
}

// Volatile store target: every write to it is an observable side effect, so
// the computation feeding it cannot be discarded as dead code.
volatile float sink;

// Folds a result into `sink` so the call that produced it stays live.
static inline void escape(Point p) {
    const float folded = p.x + p.y;
    sink = folded;
}

// Times `fn` over every point in `pts` and returns the median per-call cost
// in nanoseconds across `runs` timed passes.
//
//   fn   : callable invoked as fn(a, b, p.x, p.y), returning a Point
//   a, b : forwarded unchanged to fn
//   pts  : query points; each timed pass calls fn once per point
//   runs : number of timed passes; if <= 0 nothing is measured and 0.0 is
//          returned
//
// Three untimed warm-up passes run first. For an even number of runs the
// upper of the two middle samples is returned.
template <typename F>
static double bench(F fn, float a, float b, const std::vector<Point>& pts, int runs) {
    // With no timed passes the sample vector would be empty and the median
    // lookup below would index out of bounds.
    if (runs <= 0) return 0.0;

    // warmup
    for (int w = 0; w < 3; ++w) {
        for (const auto& p : pts) escape(fn(a, b, p.x, p.y));
    }

    std::vector<double> times;
    times.reserve(static_cast<std::size_t>(runs));

    for (int r = 0; r < runs; ++r) {
        auto t0 = Clock::now();
        for (const auto& p : pts) escape(fn(a, b, p.x, p.y));
        auto t1 = Clock::now();
        double ns = std::chrono::duration<double, std::nano>(t1 - t0).count();
        // Normalize the pass duration to a per-call figure.
        times.push_back(ns / static_cast<double>(pts.size()));
    }

    std::sort(times.begin(), times.end());
    return times[times.size() / 2]; // median
}

// Measures how far the points returned by FN lie from the ellipse
// x^2/a^2 + y^2/b^2 = 1.
//
//   FN   : solver under test, called as FN(a, b, p.x, p.y)
//   a, b : semi-axes (used as divisors)
//   pts  : query points
// Returns the largest finite |(x/a)^2 + (y/b)^2 - 1| over all results, or
// 0 when pts is empty or no result gives a finite residual.
//
// NOTE(review): the suggested build line uses -ffast-math, under which the
// compiler may assume values are finite, so the std::isfinite filter may not
// be reliable in that configuration.
template <Point(*FN)(float,float,float,float)>
static float ellipse_eq_maxerr(float a, float b, const std::vector<Point>& pts) {
    float maxe = 0.0f;
    float invA = 1.0f / a;
    float invB = 1.0f / b;
    for (const auto& p : pts) {
        Point r = FN(a, b, p.x, p.y);
        float u = r.x * invA;
        float v = r.y * invB;
        float e = std::fabs(u * u + v * v - 1.0f);   // residual of the ellipse equation
        if (std::isfinite(e)) maxe = std::fmax(maxe, e);
    }
    return maxe;
}

// Benchmark driver. For each ellipse configuration it generates N
// pseudo-random query points, times mine2 / opt3 / claude_rot4 on them, and
// prints the median ns/call together with the maximum residual of the
// ellipse equation for each solver.
int main() {
    std::mt19937 rng(42);   // fixed seed: same point set on every execution
    // Spelled-out pi: M_PI is a POSIX extension rather than standard C++.
    // The literal rounds to the same float as (float)M_PI.
    constexpr float kPi = 3.14159265358979323846f;
    std::uniform_real_distribution<float> angle(0.0f, 2.0f * kPi);
    // Radial scale factor applied to the ellipse point at the sampled angle.
    std::uniform_real_distribution<float> radius(0.5f, 2.5f);

    const int N = 50000;
    const int runs = 11;

    struct Cfg { float a, b; const char* name; } cfgs[] = {
        {150, 100, "Moderate (150,100)"},
        {200,  50, "High ecc (200,50)"},
        {100,  10, "Very high (100,10)"},
        {  1, 100, "Extreme (1,100)"},
        {100,   1, "Extreme (100,1)"},
        {  1,1000, "Extreme (1,1000)"},
        {100, 100, "Circle (100,100)"},
    };

    std::printf("====================================================\n");
    std::printf("mine2 vs opt3 vs claude_rot4 (float)\n");
    std::printf("N=%d points, median(%d) runs\n", N, runs);
    std::printf("Build: g++ -O3 -march=native -ffast-math -std=c++17\n");
    std::printf("====================================================\n\n");

    for (const auto& cfg : cfgs) {
        // The generator is shared across configurations, so each one draws
        // the next N samples from the same stream.
        std::vector<Point> pts;
        pts.reserve(N);
        for (int i = 0; i < N; ++i) {
            float ang = angle(rng);
            float rr  = radius(rng);
            pts.push_back({ cfg.a * rr * std::cos(ang), cfg.b * rr * std::sin(ang) });
        }

        double t_mine = bench(mine2, cfg.a, cfg.b, pts, runs);
        double t_opt3 = bench(opt3,  cfg.a, cfg.b, pts, runs);
        double t_cl   = bench(claude_rot4, cfg.a, cfg.b, pts, runs);

        float e_mine = ellipse_eq_maxerr<mine2>(cfg.a, cfg.b, pts);
        float e_opt  = ellipse_eq_maxerr<opt3>(cfg.a, cfg.b, pts);
        float e_cl   = ellipse_eq_maxerr<claude_rot4>(cfg.a, cfg.b, pts);

        std::printf("%s\n", cfg.name);
        std::printf("  mine2       %7.2f ns/call   max(|eq-1|)=%.2e\n", t_mine, e_mine);
        std::printf("  opt3        %7.2f ns/call   max(|eq-1|)=%.2e\n", t_opt3, e_opt);
        std::printf("  claude_rot4 %7.2f ns/call   max(|eq-1|)=%.2e\n", t_cl, e_cl);
        std::printf("\n");
    }

    return 0;
}

</syntaxhighlight>