==== Define each worker with a spec. ====

<syntaxhighlight lang="python">
from dataclasses import dataclass
from typing import Optional, Dict, List


@dataclass
class LlamaWorkerSpec:
    name: str
    model_path: str
    port: int
    cuda_visible_devices: str  # e.g. "0" or "2,3"
    host: str = "127.0.0.1"
    extra_args: Optional[List[str]] = None
    env_extra: Optional[Dict[str, str]] = None
</syntaxhighlight>

===== Use asyncio so you can manage many workers cleanly and stream logs. =====

<syntaxhighlight lang="python">
import asyncio
import os
import signal
from typing import Optional


class LlamaWorker:
    def __init__(self, spec: LlamaWorkerSpec, llama_server_path: str = "llama-server"):
        self.spec = spec
        self.llama_server_path = llama_server_path
        self.proc: Optional[asyncio.subprocess.Process] = None
        self._log_task: Optional[asyncio.Task] = None

    async def start(self) -> None:
        if self.proc and self.proc.returncode is None:
            return

        env = os.environ.copy()
        env["CUDA_VISIBLE_DEVICES"] = self.spec.cuda_visible_devices
        if self.spec.env_extra:
            env.update(self.spec.env_extra)

        args = [
            self.llama_server_path,
            "-m", self.spec.model_path,
            "--host", self.spec.host,
            "--port", str(self.spec.port),
        ]
        if self.spec.extra_args:
            args += self.spec.extra_args

        self.proc = await asyncio.create_subprocess_exec(
            *args,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.STDOUT,
            env=env,
        )
        # optional: start a log pump task (keep a reference so it is not garbage-collected)
        self._log_task = asyncio.create_task(self._pump_logs())

    async def _pump_logs(self) -> None:
        assert self.proc and self.proc.stdout
        while True:
            line = await self.proc.stdout.readline()
            if not line:
                break
            # replace with structured logging
            print(f"[{self.spec.name}] {line.decode(errors='replace').rstrip()}")

    async def stop(self, timeout_s: float = 10.0) -> None:
        if not self.proc or self.proc.returncode is not None:
            return
        # Try graceful first
        self.proc.send_signal(signal.SIGTERM)
        try:
            await asyncio.wait_for(self.proc.wait(), timeout=timeout_s)
        except asyncio.TimeoutError:
            self.proc.kill()
            await self.proc.wait()

    async def wait(self) -> int:
        if not self.proc:
            return 0
        return await self.proc.wait()
</syntaxhighlight>

===== You don't want to route requests until the worker is actually serving. =====

<syntaxhighlight lang="python">
import httpx


async def wait_ready(spec: LlamaWorkerSpec, timeout_s: float = 60.0) -> None:
    url = f"http://{spec.host}:{spec.port}/v1/models"
    async with httpx.AsyncClient(timeout=2.0) as client:
        deadline = asyncio.get_running_loop().time() + timeout_s
        while True:
            try:
                r = await client.get(url)
                if r.status_code == 200:
                    return
            except Exception:
                pass
            if asyncio.get_running_loop().time() > deadline:
                raise RuntimeError(f"Worker {spec.name} not ready after {timeout_s}s")
            await asyncio.sleep(0.25)
</syntaxhighlight>

===== Send chat completions to a worker, non-streaming or streaming. =====

A single function that both returns a value and yields chunks is not valid Python (an async generator cannot `return` a value), so the two modes are split into separate functions.

<syntaxhighlight lang="python">
async def chat(spec: LlamaWorkerSpec, model_name: str, messages) -> dict:
    # Non-streaming request: returns the parsed JSON response.
    url = f"http://{spec.host}:{spec.port}/v1/chat/completions"
    payload = {"model": model_name, "messages": messages, "stream": False}
    async with httpx.AsyncClient(timeout=None) as client:
        r = await client.post(url, json=payload)
        r.raise_for_status()
        return r.json()


async def chat_stream(spec: LlamaWorkerSpec, model_name: str, messages):
    # Streaming passthrough (SSE-ish): yields raw response chunks.
    url = f"http://{spec.host}:{spec.port}/v1/chat/completions"
    payload = {"model": model_name, "messages": messages, "stream": True}
    async with httpx.AsyncClient(timeout=None) as client:
        async with client.stream("POST", url, json=payload) as resp:
            resp.raise_for_status()
            async for chunk in resp.aiter_bytes():
                yield chunk
</syntaxhighlight>
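===== Putting it together: start, wait, query, stop. =====

To show how the pieces fit, here is a minimal orchestration sketch, not a definitive implementation. It assumes the llama-server binary is on your PATH, and the model paths, ports, GPU indices, and the model name passed in the request are placeholders to adapt to your setup.

<syntaxhighlight lang="python">
import asyncio


async def main() -> None:
    # Hypothetical workers, one model per GPU; paths, ports, and GPU ids are placeholders.
    specs = [
        LlamaWorkerSpec(name="w0", model_path="/models/model-a.gguf", port=8001, cuda_visible_devices="0"),
        LlamaWorkerSpec(name="w1", model_path="/models/model-b.gguf", port=8002, cuda_visible_devices="1"),
    ]
    workers = [LlamaWorker(spec) for spec in specs]

    # Start all workers, then block until each /v1/models endpoint answers.
    await asyncio.gather(*(w.start() for w in workers))
    await asyncio.gather(*(wait_ready(spec) for spec in specs))

    try:
        # Fan the same prompt out to every worker concurrently (non-streaming).
        messages = [{"role": "user", "content": "Say hello in one sentence."}]
        results = await asyncio.gather(
            *(chat(spec, model_name=spec.name, messages=messages) for spec in specs)
        )
        for spec, result in zip(specs, results):
            # Assumes the usual OpenAI-compatible response shape.
            print(spec.name, result["choices"][0]["message"]["content"])
    finally:
        # Always shut the subprocesses down, even if a request failed.
        await asyncio.gather(*(w.stop() for w in workers))


if __name__ == "__main__":
    asyncio.run(main())
</syntaxhighlight>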