konduktor/pyserve/process_manager.py
2025-12-04 01:25:13 +03:00

554 lines
17 KiB
Python

"""Process Manager Module
Orchestrates ASGI/WSGI applications as separate processes
"""
import asyncio
import logging
import os
import signal
import socket
import subprocess
import sys
import time
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional
from .logging_utils import get_logger
# Raise the httpx/httpcore loggers to WARNING so their lower-severity
# messages are suppressed; httpx is the client this module uses for
# readiness and health probes.
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("httpcore").setLevel(logging.WARNING)
# Module-level logger obtained from the package's logging utilities.
logger = get_logger(__name__)
class ProcessState(Enum):
    """Lifecycle states of a managed process.

    The string values are what ``ProcessInfo.to_dict()`` reports.
    """

    # Initial state of a freshly created ProcessInfo.
    PENDING = "pending"
    # Child is being spawned / waited on for readiness.
    STARTING = "starting"
    # Child answered its readiness probe.
    RUNNING = "running"
    # Termination has been requested and is in progress.
    STOPPING = "stopping"
    # Child has exited (or there was no child to stop).
    STOPPED = "stopped"
    # Start or stop raised, or the restart budget was exhausted.
    FAILED = "failed"
    # An explicit restart has been requested.
    RESTARTING = "restarting"
@dataclass
class ProcessConfig:
    """Static, user-supplied description of one application process.

    Field order is part of the interface (the dataclass accepts positional
    arguments), so new fields must only ever be appended.
    """

    # --- identity / what to run -------------------------------------------
    name: str
    app_path: str
    app_type: str = "asgi"  # asgi, wsgi

    # --- where to listen ----------------------------------------------------
    host: str = "127.0.0.1"
    port: int = 0  # 0 = auto-assign
    workers: int = 1

    # --- import / construction details --------------------------------------
    # Directory that gets prepended to the child's PYTHONPATH when set.
    module_path: Optional[str] = None
    # Whether app_path refers to a factory callable rather than an app object.
    factory: bool = False
    factory_args: Optional[Dict[str, Any]] = None
    # Extra environment variables layered over the parent's environment.
    env: Dict[str, str] = field(default_factory=dict)

    # --- health checking -----------------------------------------------------
    health_check_enabled: bool = True
    health_check_path: str = "/health"
    health_check_interval: float = 10.0
    health_check_timeout: float = 5.0
    health_check_retries: int = 3

    # --- limits / restart policy ---------------------------------------------
    max_memory_mb: Optional[int] = None
    max_restart_count: int = 5
    restart_delay: float = 1.0  # seconds
    shutdown_timeout: float = 30.0  # seconds
@dataclass
class ProcessInfo:
    """Mutable runtime record for one managed process."""

    config: ProcessConfig
    state: ProcessState = ProcessState.PENDING
    # OS process id of the child, when one exists.
    pid: Optional[int] = None
    # Port the child is (to be) bound to; 0 means none assigned yet.
    port: int = 0
    # time.time() timestamp recorded when the child was spawned.
    start_time: Optional[float] = None
    restart_count: int = 0
    # time.time() timestamp of the most recent successful health probe.
    last_health_check: Optional[float] = None
    health_check_failures: int = 0
    # Handle of the spawned child, when one exists.
    process: Optional[subprocess.Popen] = None

    @property
    def uptime(self) -> float:
        """Seconds elapsed since ``start_time``, or ``0.0`` if never started."""
        started_at = self.start_time
        return 0.0 if started_at is None else time.time() - started_at

    @property
    def is_running(self) -> bool:
        """True only when a child handle exists *and* the state is RUNNING."""
        if self.process is None:
            return False
        return self.state == ProcessState.RUNNING

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict snapshot (uptime rounded to 2 decimals)."""
        cfg = self.config
        snapshot: Dict[str, Any] = {
            "name": cfg.name,
            "state": self.state.value,
            "pid": self.pid,
            "port": self.port,
            "uptime": round(self.uptime, 2),
            "restart_count": self.restart_count,
            "health_check_failures": self.health_check_failures,
            "workers": cfg.workers,
        }
        return snapshot
class PortAllocator:
def __init__(self, start_port: int = 9000, end_port: int = 9999):
self.start_port = start_port
self.end_port = end_port
self._allocated: set[int] = set()
self._lock = asyncio.Lock()
async def allocate(self) -> int:
async with self._lock:
for port in range(self.start_port, self.end_port + 1):
if port in self._allocated:
continue
if self._is_port_available(port):
self._allocated.add(port)
return port
raise RuntimeError(f"No available ports in range {self.start_port}-{self.end_port}")
async def release(self, port: int) -> None:
async with self._lock:
self._allocated.discard(port)
def _is_port_available(self, port: int) -> bool:
try:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
s.bind(("127.0.0.1", port))
return True
except OSError:
return False
class ProcessManager:
    """Supervise ASGI/WSGI applications running as uvicorn subprocesses.

    Each registered :class:`ProcessConfig` is tracked by a
    :class:`ProcessInfo`.  The manager spawns the child, waits until it
    answers its readiness probe, optionally health-checks it in a background
    task, and restarts it with exponential back-off when it is unhealthy.
    """

    # Sleep between health-check passes when no registered process has
    # health checks enabled (there is no per-process interval to use).
    _DEFAULT_HEALTH_CHECK_INTERVAL = 10.0

    def __init__(
        self,
        port_range: tuple[int, int] = (9000, 9999),
        health_check_enabled: bool = True,
    ):
        """Create an idle manager.

        Args:
            port_range: Inclusive ``(start, end)`` range used for auto-assigned ports.
            health_check_enabled: Whether :meth:`start` launches the
                background health-check loop.
        """
        self._processes: Dict[str, ProcessInfo] = {}
        self._port_allocator = PortAllocator(*port_range)
        self._health_check_enabled = health_check_enabled
        self._health_check_task: Optional[asyncio.Task] = None
        self._shutdown_event = asyncio.Event()
        self._started = False
        # Guards registration changes to ``_processes``.
        self._lock = asyncio.Lock()

    # ------------------------------------------------------------------
    # Manager lifecycle
    # ------------------------------------------------------------------
    async def start(self) -> None:
        """Start the manager (idempotent); launches the health-check task if enabled."""
        if self._started:
            return
        self._started = True
        self._shutdown_event.clear()
        if self._health_check_enabled:
            self._health_check_task = asyncio.create_task(
                self._health_check_loop(), name="process_manager_health_check"
            )
        logger.info("Process manager started")

    async def stop(self) -> None:
        """Cancel the health-check task and stop every child (idempotent)."""
        if not self._started:
            return
        logger.info("Stopping process manager...")
        self._shutdown_event.set()
        if self._health_check_task:
            self._health_check_task.cancel()
            try:
                await self._health_check_task
            except asyncio.CancelledError:
                pass
            self._health_check_task = None
        await self.stop_all()
        self._started = False
        logger.info("Process manager stopped")

    # ------------------------------------------------------------------
    # Registration
    # ------------------------------------------------------------------
    async def register(self, config: ProcessConfig) -> ProcessInfo:
        """Register ``config`` under its name and return the new record.

        Raises:
            ValueError: if a process with the same name is already registered.
        """
        async with self._lock:
            if config.name in self._processes:
                raise ValueError(f"Process '{config.name}' already registered")
            info = ProcessInfo(config=config)
            self._processes[config.name] = info
            logger.info(f"Registered process '{config.name}'", app_path=config.app_path)
            return info

    async def unregister(self, name: str) -> None:
        """Stop (if needed) and forget the named process; unknown names are ignored."""
        async with self._lock:
            if name not in self._processes:
                return
            info = self._processes[name]
            # Stop whenever a child handle exists, not only in RUNNING state,
            # so a child in STARTING/FAILED state is not orphaned.
            if info.process is not None:
                await self._stop_process(info)
            if info.port:
                await self._port_allocator.release(info.port)
            del self._processes[name]
            logger.info(f"Unregistered process '{name}'")

    # ------------------------------------------------------------------
    # Public per-process control
    # ------------------------------------------------------------------
    async def start_process(self, name: str) -> bool:
        """Start the named process; returns True if it is (now) running."""
        info = self._processes.get(name)
        if not info:
            logger.error(f"Process '{name}' not found")
            return False
        if info.is_running:
            logger.warning(f"Process '{name}' is already running")
            return True
        return await self._start_process(info)

    async def stop_process(self, name: str) -> bool:
        """Stop the named process; returns False if unknown or stopping failed."""
        info = self._processes.get(name)
        if not info:
            logger.error(f"Process '{name}' not found")
            return False
        return await self._stop_process(info)

    async def restart_process(self, name: str) -> bool:
        """Stop the named process (if it has a child), wait ``restart_delay``, start it again."""
        info = self._processes.get(name)
        if not info:
            logger.error(f"Process '{name}' not found")
            return False
        # Look at the child handle *before* changing the state: ``is_running``
        # requires state RUNNING, so testing it after switching to RESTARTING
        # would always be False and the old child would never be stopped.
        if info.process is not None:
            # Keep the port reserved so the process comes back on the same port.
            if not await self._stop_process(info, release_port=False):
                return False
        info.state = ProcessState.RESTARTING
        await asyncio.sleep(info.config.restart_delay)
        return await self._start_process(info)

    async def start_all(self) -> Dict[str, bool]:
        """Start every registered process sequentially; returns name -> success."""
        results = {}
        # Iterate over a snapshot: the dict may change while we await.
        for name in list(self._processes):
            results[name] = await self.start_process(name)
        return results

    async def stop_all(self) -> None:
        """Stop all children concurrently; individual failures are swallowed by gather."""
        # Any record holding a child handle is stopped, regardless of state,
        # so children caught mid-start or mid-restart are not leaked.
        tasks = [
            self._stop_process(info)
            for info in list(self._processes.values())
            if info.process is not None
        ]
        if tasks:
            await asyncio.gather(*tasks, return_exceptions=True)

    # ------------------------------------------------------------------
    # Lookups
    # ------------------------------------------------------------------
    def get_process(self, name: str) -> Optional[ProcessInfo]:
        """Return the record for ``name`` or None."""
        return self._processes.get(name)

    def get_all_processes(self) -> Dict[str, ProcessInfo]:
        """Return a shallow copy of the name -> record mapping."""
        return self._processes.copy()

    def get_process_by_port(self, port: int) -> Optional[ProcessInfo]:
        """Return the first record whose port equals ``port``, or None."""
        for info in self._processes.values():
            if info.port == port:
                return info
        return None

    def get_upstream_url(self, name: str) -> Optional[str]:
        """Return ``http://host:port`` for a running process, else None."""
        info = self._processes.get(name)
        if not info or not info.is_running:
            return None
        return f"http://{info.config.host}:{info.port}"

    # ------------------------------------------------------------------
    # Internal start / stop machinery
    # ------------------------------------------------------------------
    @staticmethod
    def _signal_process(process: subprocess.Popen, force: bool = False) -> None:
        """Ask (or, with ``force``, compel) the child to exit.

        Where ``os.killpg`` exists the whole process group is signalled
        (SIGTERM, or SIGKILL when forced); otherwise only the child itself is
        terminated/killed.  A child that is already gone is not an error.
        """
        if hasattr(os, "killpg"):
            sig = signal.SIGKILL if force else signal.SIGTERM
            try:
                os.killpg(os.getpgid(process.pid), sig)
            except ProcessLookupError:
                pass
        elif force:
            process.kill()
        else:
            process.terminate()

    @staticmethod
    def _close_pipes(process: subprocess.Popen) -> None:
        """Close the parent's ends of the child's stdout/stderr pipes."""
        for stream in (process.stdout, process.stderr):
            if stream is not None:
                stream.close()

    async def _discard_process(self, info: ProcessInfo) -> None:
        """Force-kill and forget a child whose start-up failed."""
        process = info.process
        info.process = None
        info.pid = None
        if process is None:
            return
        try:
            if process.poll() is None:
                self._signal_process(process, force=True)
                # Reap in a worker thread so the event loop is not blocked.
                await asyncio.get_running_loop().run_in_executor(None, process.wait)
            self._close_pipes(process)
        except Exception as e:
            logger.warning(f"Error cleaning up process '{info.config.name}': {e}")

    async def _start_process(self, info: ProcessInfo) -> bool:
        """Spawn the child for ``info`` and wait until it is ready.

        Returns:
            True when the child answered its readiness probe.  On any failure
            the child is killed, the port is released, the state becomes
            FAILED and False is returned.
        """
        config = info.config
        try:
            info.state = ProcessState.STARTING
            if info.port == 0:
                # Honour an explicitly configured port; 0 means auto-assign.
                info.port = config.port or await self._port_allocator.allocate()
            cmd = self._build_command(config, info.port)
            env = os.environ.copy()
            env.update(config.env)
            if config.module_path:
                python_path = env.get("PYTHONPATH", "")
                module_dir = str(Path(config.module_path).resolve())
                # os.pathsep keeps this correct on platforms where the
                # separator is not ":".
                env["PYTHONPATH"] = f"{module_dir}{os.pathsep}{python_path}" if python_path else module_dir
            # For WSGI apps, pass configuration via environment variables
            if config.app_type == "wsgi":
                env["PYSERVE_WSGI_APP"] = config.app_path
                env["PYSERVE_WSGI_FACTORY"] = "1" if config.factory else "0"
            logger.info(
                f"Starting process '{config.name}'",
                command=" ".join(cmd),
                port=info.port,
            )
            # NOTE(review): stdout/stderr are pipes that are only read if the
            # child exits during start-up; a chatty child can eventually fill
            # the pipe buffer and block.  Consider draining or redirecting.
            info.process = subprocess.Popen(
                cmd,
                env=env,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                # Same effect as preexec_fn=os.setsid (own session/process
                # group, so killpg reaches the workers) without preexec_fn's
                # thread-safety problems.
                start_new_session=hasattr(os, "setsid"),
            )
            info.pid = info.process.pid
            info.start_time = time.time()
            if not await self._wait_for_ready(info):
                raise RuntimeError(f"Process '{config.name}' failed to start")
            info.state = ProcessState.RUNNING
            logger.info(
                f"Process '{config.name}' started successfully",
                pid=info.pid,
                port=info.port,
            )
            return True
        except Exception as e:
            logger.error(f"Failed to start process '{config.name}': {e}")
            # Do not leave a half-started child behind.
            await self._discard_process(info)
            info.state = ProcessState.FAILED
            if info.port:
                await self._port_allocator.release(info.port)
                info.port = 0
            return False

    async def _stop_process(self, info: ProcessInfo, release_port: bool = True) -> bool:
        """Terminate the child of ``info``, escalating to a kill after ``shutdown_timeout``.

        Args:
            info: Record whose child should be stopped.
            release_port: When True (default) the port is returned to the
                allocator and ``info.port`` reset to 0 so that a later start
                allocates afresh.  Restart paths pass False to keep the port
                reserved and stable.

        Returns:
            True if the child is gone (or there was none); False if stopping raised.
        """
        process = info.process
        if not process:
            info.state = ProcessState.STOPPED
            return True
        config = info.config
        info.state = ProcessState.STOPPING
        try:
            self._signal_process(process)
            loop = asyncio.get_running_loop()
            try:
                await asyncio.wait_for(
                    loop.run_in_executor(None, process.wait),
                    timeout=config.shutdown_timeout,
                )
            except asyncio.TimeoutError:
                logger.warning(f"Process '{config.name}' did not stop gracefully, forcing kill")
                self._signal_process(process, force=True)
                # Reap off the event loop thread.
                await loop.run_in_executor(None, process.wait)
            self._close_pipes(process)
            if release_port and info.port:
                await self._port_allocator.release(info.port)
                # A released port may be handed to another process, so it
                # must not be reused implicitly on the next start.
                info.port = 0
            info.state = ProcessState.STOPPED
            info.process = None
            info.pid = None
            # Stopped processes report zero uptime.
            info.start_time = None
            logger.info(f"Process '{config.name}' stopped")
            return True
        except Exception as e:
            logger.error(f"Error stopping process '{config.name}': {e}")
            info.state = ProcessState.FAILED
            return False

    async def _wait_for_ready(self, info: ProcessInfo, timeout: float = 30.0) -> bool:
        """Poll the health-check URL until it answers with a status below 500.

        Returns False if the child exits first or ``timeout`` seconds elapse.
        """
        import httpx

        url = f"http://{info.config.host}:{info.port}{info.config.health_check_path}"
        # Monotonic clock: immune to wall-clock adjustments.
        deadline = time.monotonic() + timeout
        async with httpx.AsyncClient(timeout=2.0) as client:
            while time.monotonic() < deadline:
                if info.process and info.process.poll() is not None:
                    _, stderr = info.process.communicate()
                    logger.error(
                        f"Process '{info.config.name}' exited during startup",
                        returncode=info.process.returncode,
                        stderr=stderr.decode(errors="replace") if stderr else "",
                    )
                    return False
                try:
                    resp = await client.get(url)
                    if resp.status_code < 500:
                        return True
                except Exception:
                    # Deliberately best-effort: connection errors are expected
                    # until the server is listening.
                    pass
                await asyncio.sleep(0.5)
        return False

    # ------------------------------------------------------------------
    # Health checking
    # ------------------------------------------------------------------
    async def _health_check_loop(self) -> None:
        """Probe every running, health-checked process until shutdown is signalled."""
        while not self._shutdown_event.is_set():
            try:
                for info in list(self._processes.values()):
                    if not info.is_running or not info.config.health_check_enabled:
                        continue
                    await self._check_process_health(info)
                # ``default`` covers both "no processes" and "processes exist
                # but none is health-checked"; a bare min() would raise
                # ValueError in the latter case.
                interval = min(
                    (
                        p.config.health_check_interval
                        for p in self._processes.values()
                        if p.config.health_check_enabled
                    ),
                    default=self._DEFAULT_HEALTH_CHECK_INTERVAL,
                )
                try:
                    await asyncio.wait_for(self._shutdown_event.wait(), timeout=interval)
                    break
                except asyncio.TimeoutError:
                    pass
            except Exception as e:
                logger.error(f"Error in health check loop: {e}")
                await asyncio.sleep(5)

    async def _check_process_health(self, info: ProcessInfo) -> bool:
        """Probe one process; after ``health_check_retries`` consecutive failures restart it."""
        import httpx

        config = info.config
        url = f"http://{config.host}:{info.port}{config.health_check_path}"
        try:
            async with httpx.AsyncClient(timeout=config.health_check_timeout) as client:
                resp = await client.get(url)
                if resp.status_code < 500:
                    info.health_check_failures = 0
                    info.last_health_check = time.time()
                    return True
                else:
                    # Routed through the except branch so 5xx responses and
                    # transport errors share one failure path.
                    raise Exception(f"Health check returned status {resp.status_code}")
        except Exception as e:
            info.health_check_failures += 1
            logger.warning(
                f"Health check failed for '{config.name}'",
                failures=info.health_check_failures,
                error=str(e),
            )
            if info.health_check_failures >= config.health_check_retries:
                logger.error(f"Process '{config.name}' is unhealthy, restarting...")
                await self._handle_unhealthy_process(info)
            return False

    async def _handle_unhealthy_process(self, info: ProcessInfo) -> None:
        """Restart an unhealthy process with exponential back-off (capped at 60 s).

        Once ``max_restart_count`` restarts have been used the process is
        marked FAILED instead.
        """
        config = info.config
        if info.restart_count >= config.max_restart_count:
            logger.error(f"Process '{config.name}' exceeded max restart count, marking as failed")
            info.state = ProcessState.FAILED
            return
        info.restart_count += 1
        info.health_check_failures = 0
        delay = config.restart_delay * (2 ** (info.restart_count - 1))
        delay = min(delay, 60.0)
        logger.info(
            f"Restarting process '{config.name}'",
            restart_count=info.restart_count,
            delay=delay,
        )
        # Keep the port reserved across the restart; bail out if the old
        # child could not be stopped rather than spawning a duplicate.
        if not await self._stop_process(info, release_port=False):
            return
        await asyncio.sleep(delay)
        await self._start_process(info)

    # ------------------------------------------------------------------
    # Command construction / metrics
    # ------------------------------------------------------------------
    def _build_command(self, config: ProcessConfig, port: int) -> List[str]:
        """Build the ``python -m uvicorn ...`` argv for ``config`` on ``port``."""
        if config.app_type == "wsgi":
            wrapper_app = self._create_wsgi_wrapper_path(config)
            app_path = wrapper_app
        else:
            app_path = config.app_path
        cmd = [
            sys.executable,
            "-m",
            "uvicorn",
            app_path,
            "--host",
            config.host,
            "--port",
            str(port),
            "--workers",
            str(config.workers),
            "--log-level",
            "warning",
            "--no-access-log",
        ]
        # For WSGI the factory flag travels via PYSERVE_WSGI_FACTORY instead.
        if config.factory and config.app_type != "wsgi":
            cmd.append("--factory")
        return cmd

    def _create_wsgi_wrapper_path(self, config: ProcessConfig) -> str:
        """Return the uvicorn import string used to serve WSGI applications.

        Since uvicorn can't directly run WSGI apps, a wrapper module is used
        that imports the WSGI app and wraps it with a2wsgi.
        """
        # The wrapper target is pyserve._wsgi_wrapper:app; the real app path
        # is passed to the child through the PYSERVE_WSGI_APP environment
        # variable, so ``config`` is not needed here.
        return "pyserve._wsgi_wrapper:app"

    def get_metrics(self) -> Dict[str, Any]:
        """Return aggregate counts plus a ``to_dict()`` snapshot per process."""
        return {
            "managed_processes": len(self._processes),
            "running_processes": sum(1 for p in self._processes.values() if p.is_running),
            "processes": {name: info.to_dict() for name, info in self._processes.items()},
        }
# Module-wide ProcessManager instance; None until created by
# get_process_manager() or init_process_manager().
_process_manager: Optional[ProcessManager] = None


def get_process_manager() -> ProcessManager:
    """Return the shared ProcessManager, creating a default one on first use.

    The lazily created manager is constructed with default arguments and is
    not started here.
    """
    global _process_manager
    manager = _process_manager
    if manager is None:
        manager = _process_manager = ProcessManager()
    return manager
async def init_process_manager(
    port_range: tuple[int, int] = (9000, 9999),
    health_check_enabled: bool = True,
) -> ProcessManager:
    """Create, install and start the shared ProcessManager.

    If a manager is already installed it is stopped first, so that its
    health-check task and child processes are not orphaned when the global
    reference is replaced.

    Args:
        port_range: Inclusive ``(start, end)`` range for auto-assigned ports.
        health_check_enabled: Whether the background health-check loop runs.

    Returns:
        The newly started manager.
    """
    global _process_manager
    previous = _process_manager
    if previous is not None:
        # stop() returns immediately for a manager that was never started.
        await previous.stop()
    _process_manager = ProcessManager(
        port_range=port_range,
        health_check_enabled=health_check_enabled,
    )
    await _process_manager.start()
    return _process_manager
async def shutdown_process_manager() -> None:
    """Stop the shared ProcessManager, if any, and clear the global reference."""
    global _process_manager
    if not _process_manager:
        return
    await _process_manager.stop()
    _process_manager = None