# konduktor/pyserve/process_manager.py

"""Process Manager Module

Orchestrates ASGI/WSGI applications as separate processes
"""

import asyncio
import logging
import os
import signal
import socket
import subprocess
import sys
import time
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional

from .logging_utils import get_logger

# Raise the threshold of the httpx/httpcore loggers so that only records at
# WARNING level or above are emitted by them.
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("httpcore").setLevel(logging.WARNING)

# Module-level logger obtained from the project's logging helper.
logger = get_logger(__name__)


class ProcessState(Enum):
    """Lifecycle states recorded for a managed process."""

    # Initial state of a freshly created ProcessInfo.
    PENDING = "pending"
    # The child is being spawned / waited on for readiness.
    STARTING = "starting"
    # The child answered its readiness probe.
    RUNNING = "running"
    # A stop has been requested and is in progress.
    STOPPING = "stopping"
    # The child has been shut down (or never had a process handle).
    STOPPED = "stopped"
    # Starting or stopping raised, or the restart budget was exhausted.
    FAILED = "failed"
    # A manual restart has been requested.
    RESTARTING = "restarting"


@dataclass
class ProcessConfig:
    """Static settings describing one application to run as a child process.

    Only ``name`` and ``app_path`` are required; every other field has a default.
    """

    # --- identity and launch target ---
    name: str  # key under which the process is registered
    app_path: str  # app import string handed to uvicorn (or to the WSGI wrapper via env)
    app_type: str = "asgi"  # "asgi" or "wsgi"
    host: str = "127.0.0.1"  # passed to uvicorn as --host
    port: int = 0  # 0 means auto-assign
    workers: int = 1  # passed to uvicorn as --workers
    module_path: Optional[str] = None  # directory prepended to the child's PYTHONPATH
    factory: bool = False  # whether app_path names an application factory
    factory_args: Optional[Dict[str, Any]] = None
    env: Dict[str, str] = field(default_factory=dict)  # extra environment variables for the child

    # --- health checking ---
    health_check_enabled: bool = True
    health_check_path: str = "/health"  # URL path probed over HTTP
    health_check_interval: float = 10.0  # seconds
    health_check_timeout: float = 5.0  # seconds
    health_check_retries: int = 3  # consecutive failures tolerated before a restart

    # --- restart policy ---
    max_memory_mb: Optional[int] = None
    max_restart_count: int = 5  # automatic restarts allowed before giving up
    restart_delay: float = 1.0  # seconds; base delay before a restart

    # --- shutdown ---
    shutdown_timeout: float = 30.0  # seconds to wait after SIGTERM before forcing a kill


@dataclass
class ProcessInfo:
    """Mutable runtime record kept for one registered process."""

    config: ProcessConfig
    state: ProcessState = ProcessState.PENDING
    pid: Optional[int] = None
    port: int = 0  # 0 until a port has been assigned
    start_time: Optional[float] = None  # time.time() value captured at spawn
    restart_count: int = 0
    last_health_check: Optional[float] = None  # time.time() of the last successful check
    health_check_failures: int = 0
    process: Optional[subprocess.Popen] = None

    @property
    def uptime(self) -> float:
        """Seconds elapsed since ``start_time``, or 0.0 when it is unset."""
        started_at = self.start_time
        return 0.0 if started_at is None else time.time() - started_at

    @property
    def is_running(self) -> bool:
        """True only when a process handle exists and the state is RUNNING."""
        if self.process is None:
            return False
        return self.state == ProcessState.RUNNING

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict summary of this record (uptime rounded to 2 places)."""
        cfg = self.config
        summary: Dict[str, Any] = {"name": cfg.name}
        summary["state"] = self.state.value
        summary["pid"] = self.pid
        summary["port"] = self.port
        summary["uptime"] = round(self.uptime, 2)
        summary["restart_count"] = self.restart_count
        summary["health_check_failures"] = self.health_check_failures
        summary["workers"] = cfg.workers
        return summary


class PortAllocator:
    def __init__(self, start_port: int = 9000, end_port: int = 9999):
        self.start_port = start_port
        self.end_port = end_port
        self._allocated: set[int] = set()
        self._lock = asyncio.Lock()

    async def allocate(self) -> int:
        async with self._lock:
            for port in range(self.start_port, self.end_port + 1):
                if port in self._allocated:
                    continue
                if self._is_port_available(port):
                    self._allocated.add(port)
                    return port
            raise RuntimeError(f"No available ports in range {self.start_port}-{self.end_port}")

    async def release(self, port: int) -> None:
        async with self._lock:
            self._allocated.discard(port)

    def _is_port_available(self, port: int) -> bool:
        try:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                s.bind(("127.0.0.1", port))
                return True
        except OSError:
            return False


class ProcessManager:
    def __init__(
        self,
        port_range: tuple[int, int] = (9000, 9999),
        health_check_enabled: bool = True,
    ):
        self._processes: Dict[str, ProcessInfo] = {}
        self._port_allocator = PortAllocator(*port_range)
        self._health_check_enabled = health_check_enabled
        self._health_check_task: Optional[asyncio.Task] = None
        self._shutdown_event = asyncio.Event()
        self._started = False
        self._lock = asyncio.Lock()

    async def start(self) -> None:
        if self._started:
            return

        self._started = True
        self._shutdown_event.clear()

        if self._health_check_enabled:
            self._health_check_task = asyncio.create_task(self._health_check_loop(), name="process_manager_health_check")

        logger.info("Process manager started")

    async def stop(self) -> None:
        if not self._started:
            return

        logger.info("Stopping process manager...")
        self._shutdown_event.set()

        if self._health_check_task:
            self._health_check_task.cancel()
            try:
                await self._health_check_task
            except asyncio.CancelledError:
                pass

        await self.stop_all()

        self._started = False
        logger.info("Process manager stopped")

    async def register(self, config: ProcessConfig) -> ProcessInfo:
        async with self._lock:
            if config.name in self._processes:
                raise ValueError(f"Process '{config.name}' already registered")

            info = ProcessInfo(config=config)
            self._processes[config.name] = info

            logger.info(f"Registered process '{config.name}'", app_path=config.app_path)
            return info

    async def unregister(self, name: str) -> None:
        async with self._lock:
            if name not in self._processes:
                return

            info = self._processes[name]
            if info.is_running:
                await self._stop_process(info)

            if info.port:
                await self._port_allocator.release(info.port)

            del self._processes[name]
            logger.info(f"Unregistered process '{name}'")

    async def start_process(self, name: str) -> bool:
        info = self._processes.get(name)
        if not info:
            logger.error(f"Process '{name}' not found")
            return False

        if info.is_running:
            logger.warning(f"Process '{name}' is already running")
            return True

        return await self._start_process(info)

    async def stop_process(self, name: str) -> bool:
        info = self._processes.get(name)
        if not info:
            logger.error(f"Process '{name}' not found")
            return False

        return await self._stop_process(info)

    async def restart_process(self, name: str) -> bool:
        info = self._processes.get(name)
        if not info:
            logger.error(f"Process '{name}' not found")
            return False

        info.state = ProcessState.RESTARTING

        if info.is_running:
            await self._stop_process(info)

        await asyncio.sleep(info.config.restart_delay)
        return await self._start_process(info)

    async def start_all(self) -> Dict[str, bool]:
        results = {}
        for name in self._processes:
            results[name] = await self.start_process(name)
        return results

    async def stop_all(self) -> None:
        tasks = []
        for info in self._processes.values():
            if info.is_running:
                tasks.append(self._stop_process(info))

        if tasks:
            await asyncio.gather(*tasks, return_exceptions=True)

    def get_process(self, name: str) -> Optional[ProcessInfo]:
        return self._processes.get(name)

    def get_all_processes(self) -> Dict[str, ProcessInfo]:
        return self._processes.copy()

    def get_process_by_port(self, port: int) -> Optional[ProcessInfo]:
        for info in self._processes.values():
            if info.port == port:
                return info
        return None

    def get_upstream_url(self, name: str) -> Optional[str]:
        info = self._processes.get(name)
        if not info or not info.is_running:
            return None
        return f"http://{info.config.host}:{info.port}"

    async def _start_process(self, info: ProcessInfo) -> bool:
        config = info.config

        try:
            info.state = ProcessState.STARTING

            if info.port == 0:
                info.port = await self._port_allocator.allocate()

            cmd = self._build_command(config, info.port)

            env = os.environ.copy()
            env.update(config.env)

            if config.module_path:
                python_path = env.get("PYTHONPATH", "")
                module_dir = str(Path(config.module_path).resolve())
                env["PYTHONPATH"] = f"{module_dir}:{python_path}" if python_path else module_dir

            # For WSGI apps, pass configuration via environment variables
            if config.app_type == "wsgi":
                env["PYSERVE_WSGI_APP"] = config.app_path
                env["PYSERVE_WSGI_FACTORY"] = "1" if config.factory else "0"

            logger.info(
                f"Starting process '{config.name}'",
                command=" ".join(cmd),
                port=info.port,
            )

            info.process = subprocess.Popen(
                cmd,
                env=env,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                preexec_fn=os.setsid if hasattr(os, "setsid") else None,
            )

            info.pid = info.process.pid
            info.start_time = time.time()

            if not await self._wait_for_ready(info):
                raise RuntimeError(f"Process '{config.name}' failed to start")

            info.state = ProcessState.RUNNING
            logger.info(
                f"Process '{config.name}' started successfully",
                pid=info.pid,
                port=info.port,
            )
            return True

        except Exception as e:
            logger.error(f"Failed to start process '{config.name}': {e}")
            info.state = ProcessState.FAILED
            if info.port:
                await self._port_allocator.release(info.port)
                info.port = 0
            return False

    async def _stop_process(self, info: ProcessInfo) -> bool:
        if not info.process:
            info.state = ProcessState.STOPPED
            return True

        config = info.config
        info.state = ProcessState.STOPPING

        try:
            if hasattr(os, "killpg"):
                try:
                    os.killpg(os.getpgid(info.process.pid), signal.SIGTERM)
                except ProcessLookupError:
                    pass
            else:
                info.process.terminate()

            try:
                await asyncio.wait_for(asyncio.get_event_loop().run_in_executor(None, info.process.wait), timeout=config.shutdown_timeout)
            except asyncio.TimeoutError:
                logger.warning(f"Process '{config.name}' did not stop gracefully, forcing kill")
                if hasattr(os, "killpg"):
                    try:
                        os.killpg(os.getpgid(info.process.pid), signal.SIGKILL)
                    except ProcessLookupError:
                        pass
                else:
                    info.process.kill()
                info.process.wait()

            if info.port:
                await self._port_allocator.release(info.port)

            info.state = ProcessState.STOPPED
            info.process = None
            info.pid = None

            logger.info(f"Process '{config.name}' stopped")
            return True

        except Exception as e:
            logger.error(f"Error stopping process '{config.name}': {e}")
            info.state = ProcessState.FAILED
            return False

    async def _wait_for_ready(self, info: ProcessInfo, timeout: float = 30.0) -> bool:
        import httpx

        start_time = time.time()
        url = f"http://{info.config.host}:{info.port}{info.config.health_check_path}"

        while time.time() - start_time < timeout:
            if info.process and info.process.poll() is not None:
                stdout, stderr = info.process.communicate()
                logger.error(
                    f"Process '{info.config.name}' exited during startup",
                    returncode=info.process.returncode,
                    stderr=stderr.decode() if stderr else "",
                )
                return False

            try:
                async with httpx.AsyncClient(timeout=2.0) as client:
                    resp = await client.get(url)
                    if resp.status_code < 500:
                        return True
            except Exception:
                pass

            await asyncio.sleep(0.5)

        return False

    async def _health_check_loop(self) -> None:
        while not self._shutdown_event.is_set():
            try:
                for info in list(self._processes.values()):
                    if not info.is_running or not info.config.health_check_enabled:
                        continue

                    await self._check_process_health(info)

                try:
                    await asyncio.wait_for(
                        self._shutdown_event.wait(),
                        timeout=(
                            min(p.config.health_check_interval for p in self._processes.values() if p.config.health_check_enabled)
                            if self._processes
                            else 10.0
                        ),
                    )
                    break
                except asyncio.TimeoutError:
                    pass

            except Exception as e:
                logger.error(f"Error in health check loop: {e}")
                await asyncio.sleep(5)

    async def _check_process_health(self, info: ProcessInfo) -> bool:
        import httpx

        config = info.config
        url = f"http://{config.host}:{info.port}{config.health_check_path}"

        try:
            async with httpx.AsyncClient(timeout=config.health_check_timeout) as client:
                resp = await client.get(url)
                if resp.status_code < 500:
                    info.health_check_failures = 0
                    info.last_health_check = time.time()
                    return True
                else:
                    raise Exception(f"Health check returned status {resp.status_code}")

        except Exception as e:
            info.health_check_failures += 1
            logger.warning(
                f"Health check failed for '{config.name}'",
                failures=info.health_check_failures,
                error=str(e),
            )

            if info.health_check_failures >= config.health_check_retries:
                logger.error(f"Process '{config.name}' is unhealthy, restarting...")
                await self._handle_unhealthy_process(info)

            return False

    async def _handle_unhealthy_process(self, info: ProcessInfo) -> None:
        config = info.config

        if info.restart_count >= config.max_restart_count:
            logger.error(f"Process '{config.name}' exceeded max restart count, marking as failed")
            info.state = ProcessState.FAILED
            return

        info.restart_count += 1
        info.health_check_failures = 0

        delay = config.restart_delay * (2 ** (info.restart_count - 1))
        delay = min(delay, 60.0)

        logger.info(
            f"Restarting process '{config.name}'",
            restart_count=info.restart_count,
            delay=delay,
        )

        await self._stop_process(info)
        await asyncio.sleep(delay)
        await self._start_process(info)

    def _build_command(self, config: ProcessConfig, port: int) -> List[str]:
        if config.app_type == "wsgi":
            wrapper_app = self._create_wsgi_wrapper_path(config)
            app_path = wrapper_app
        else:
            app_path = config.app_path

        cmd = [
            sys.executable,
            "-m",
            "uvicorn",
            app_path,
            "--host",
            config.host,
            "--port",
            str(port),
            "--workers",
            str(config.workers),
            "--log-level",
            "warning",
            "--no-access-log",
        ]

        if config.factory and config.app_type != "wsgi":
            cmd.append("--factory")

        return cmd

    def _create_wsgi_wrapper_path(self, config: ProcessConfig) -> str:
        """
        Since uvicorn can't directly run WSGI apps, we create a wrapper
        that imports the WSGI app and wraps it with a2wsgi.
        """
        # For WSGI apps, we'll use a special wrapper module
        # The wrapper is: pyserve._wsgi_wrapper:create_app
        # It will be called with app_path as environment variable
        return "pyserve._wsgi_wrapper:app"

    def get_metrics(self) -> Dict[str, Any]:
        return {
            "managed_processes": len(self._processes),
            "running_processes": sum(1 for p in self._processes.values() if p.is_running),
            "processes": {name: info.to_dict() for name, info in self._processes.items()},
        }


# Module-wide ProcessManager instance; None until created by
# get_process_manager() or init_process_manager(), and reset to None by
# shutdown_process_manager().
_process_manager: Optional[ProcessManager] = None


def get_process_manager() -> ProcessManager:
    """Return the module-wide manager, creating it with defaults on first use.

    A manager created here has not been started.
    """
    global _process_manager
    manager = _process_manager
    if manager is None:
        manager = _process_manager = ProcessManager()
    return manager


async def init_process_manager(
    port_range: tuple[int, int] = (9000, 9999),
    health_check_enabled: bool = True,
) -> ProcessManager:
    """Create a new module-wide manager, start it, and return it.

    Any previously stored manager is replaced.

    Args:
        port_range: forwarded to ``ProcessManager``.
        health_check_enabled: forwarded to ``ProcessManager``.
    """
    global _process_manager
    settings = {"port_range": port_range, "health_check_enabled": health_check_enabled}
    # Publish the instance before starting it, so it is already reachable
    # through the module global while start() runs.
    _process_manager = ProcessManager(**settings)
    await _process_manager.start()
    return _process_manager


async def shutdown_process_manager() -> None:
    """Stop the module-wide manager, if there is one, and forget it."""
    global _process_manager
    if not _process_manager:
        return
    await _process_manager.stop()
    _process_manager = None