forked from Shifty/pyserveX
554 lines
17 KiB
Python
554 lines
17 KiB
Python
"""Process Manager Module

Orchestrates ASGI/WSGI applications as separate processes
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
import signal
|
|
import socket
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from enum import Enum
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from .logging_utils import get_logger
|
|
|
|
logging.getLogger("httpx").setLevel(logging.WARNING)
|
|
logging.getLogger("httpcore").setLevel(logging.WARNING)
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|
class ProcessState(Enum):
    """Lifecycle states a managed process can be in.

    The string values are what ``ProcessInfo.to_dict`` exposes under the
    ``"state"`` key.
    """

    # Registered, never started.
    PENDING = "pending"
    # Spawned (or about to be) and waiting for the readiness probe.
    STARTING = "starting"
    # Readiness probe succeeded.
    RUNNING = "running"
    # Termination signal sent, waiting for exit.
    STOPPING = "stopping"
    # Exited and cleaned up.
    STOPPED = "stopped"
    # Start or stop failed, or the restart budget was exhausted.
    FAILED = "failed"
    # Restart requested.
    RESTARTING = "restarting"
|
|
|
|
|
|
@dataclass
class ProcessConfig:
    """Static settings describing one application to run as a child process."""

    # --- identity / launch target -------------------------------------
    name: str
    app_path: str
    app_type: str = "asgi"  # asgi, wsgi
    host: str = "127.0.0.1"
    port: int = 0  # 0 = auto-assign
    workers: int = 1
    module_path: Optional[str] = None
    factory: bool = False
    factory_args: Optional[Dict[str, Any]] = None
    # Extra environment variables layered on top of the parent's environment.
    env: Dict[str, str] = field(default_factory=dict)

    # --- health checking ------------------------------------------------
    health_check_enabled: bool = True
    health_check_path: str = "/health"
    health_check_interval: float = 10.0
    health_check_timeout: float = 5.0
    health_check_retries: int = 3

    # --- restart policy -------------------------------------------------
    max_memory_mb: Optional[int] = None
    max_restart_count: int = 5
    restart_delay: float = 1.0  # seconds

    # --- shutdown -------------------------------------------------------
    shutdown_timeout: float = 30.0  # seconds
|
|
|
|
|
|
@dataclass
class ProcessInfo:
    """Mutable runtime record for one managed process."""

    config: ProcessConfig
    state: ProcessState = ProcessState.PENDING
    pid: Optional[int] = None
    port: int = 0
    start_time: Optional[float] = None
    restart_count: int = 0
    last_health_check: Optional[float] = None
    health_check_failures: int = 0
    process: Optional[subprocess.Popen] = None

    @property
    def uptime(self) -> float:
        """Seconds elapsed since ``start_time``, or 0.0 if it was never set."""
        started_at = self.start_time
        return 0.0 if started_at is None else time.time() - started_at

    @property
    def is_running(self) -> bool:
        """True when the state is RUNNING and a process handle is attached.

        This does not poll the child, so a crashed child still counts as
        running until something changes the state.
        """
        if self.process is None:
            return False
        return self.state == ProcessState.RUNNING

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict snapshot of this record."""
        cfg = self.config
        return dict(
            name=cfg.name,
            state=self.state.value,
            pid=self.pid,
            port=self.port,
            uptime=round(self.uptime, 2),
            restart_count=self.restart_count,
            health_check_failures=self.health_check_failures,
            workers=cfg.workers,
        )
|
|
|
|
|
|
class PortAllocator:
|
|
def __init__(self, start_port: int = 9000, end_port: int = 9999):
|
|
self.start_port = start_port
|
|
self.end_port = end_port
|
|
self._allocated: set[int] = set()
|
|
self._lock = asyncio.Lock()
|
|
|
|
async def allocate(self) -> int:
|
|
async with self._lock:
|
|
for port in range(self.start_port, self.end_port + 1):
|
|
if port in self._allocated:
|
|
continue
|
|
if self._is_port_available(port):
|
|
self._allocated.add(port)
|
|
return port
|
|
raise RuntimeError(f"No available ports in range {self.start_port}-{self.end_port}")
|
|
|
|
async def release(self, port: int) -> None:
|
|
async with self._lock:
|
|
self._allocated.discard(port)
|
|
|
|
def _is_port_available(self, port: int) -> bool:
|
|
try:
|
|
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
|
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
|
|
s.bind(("127.0.0.1", port))
|
|
return True
|
|
except OSError:
|
|
return False
|
|
|
|
|
|
class ProcessManager:
    """Supervises applications that run as ``python -m uvicorn`` child processes.

    Each registered ``ProcessConfig`` is tracked as a ``ProcessInfo``. The
    manager spawns the child, waits until its health endpoint answers, polls
    that endpoint periodically and restarts the child with exponential backoff
    once it fails ``health_check_retries`` consecutive checks.
    """

    def __init__(
        self,
        port_range: tuple[int, int] = (9000, 9999),
        health_check_enabled: bool = True,
    ):
        """
        Args:
            port_range: inclusive (start, end) range used for auto-assigned ports.
            health_check_enabled: when False no background health loop is started.
        """
        self._processes: Dict[str, ProcessInfo] = {}
        self._port_allocator = PortAllocator(*port_range)
        self._health_check_enabled = health_check_enabled
        self._health_check_task: Optional[asyncio.Task] = None
        self._shutdown_event = asyncio.Event()
        self._started = False
        # Guards register() / unregister().
        self._lock = asyncio.Lock()

    async def start(self) -> None:
        """Start the manager (idempotent) and, if enabled, the health loop."""
        if self._started:
            return

        self._started = True
        self._shutdown_event.clear()

        if self._health_check_enabled:
            self._health_check_task = asyncio.create_task(self._health_check_loop(), name="process_manager_health_check")

        logger.info("Process manager started")

    async def stop(self) -> None:
        """Stop the health loop and every child process (idempotent)."""
        if not self._started:
            return

        logger.info("Stopping process manager...")
        self._shutdown_event.set()

        if self._health_check_task:
            self._health_check_task.cancel()
            try:
                await self._health_check_task
            except asyncio.CancelledError:
                pass
            # Drop the finished task so a later start() creates a fresh one.
            self._health_check_task = None

        await self.stop_all()

        self._started = False
        logger.info("Process manager stopped")

    async def register(self, config: ProcessConfig) -> ProcessInfo:
        """Add ``config`` to the managed set without starting it.

        Raises:
            ValueError: if a process with the same name is already registered.
        """
        async with self._lock:
            if config.name in self._processes:
                raise ValueError(f"Process '{config.name}' already registered")

            info = ProcessInfo(config=config)
            self._processes[config.name] = info

            logger.info(f"Registered process '{config.name}'", app_path=config.app_path)
            return info

    async def unregister(self, name: str) -> None:
        """Stop (if needed) and forget the process called ``name``; unknown names are ignored."""
        async with self._lock:
            if name not in self._processes:
                return

            info = self._processes[name]
            # Stop whenever a child handle exists, not only in RUNNING state,
            # so a FAILED-but-alive child is not orphaned.
            if info.process is not None:
                await self._stop_process(info)

            if info.port:
                await self._port_allocator.release(info.port)

            del self._processes[name]
            logger.info(f"Unregistered process '{name}'")

    async def start_process(self, name: str) -> bool:
        """Start the named process. Returns True if it is running afterwards."""
        info = self._processes.get(name)
        if not info:
            logger.error(f"Process '{name}' not found")
            return False

        if info.is_running:
            logger.warning(f"Process '{name}' is already running")
            return True

        return await self._start_process(info)

    async def stop_process(self, name: str) -> bool:
        """Stop the named process. Returns False if it is unknown or stopping failed."""
        info = self._processes.get(name)
        if not info:
            logger.error(f"Process '{name}' not found")
            return False

        return await self._stop_process(info)

    async def restart_process(self, name: str) -> bool:
        """Stop the named process, wait ``restart_delay`` seconds and start it again."""
        info = self._processes.get(name)
        if not info:
            logger.error(f"Process '{name}' not found")
            return False

        # Decide on the process handle, not on is_running: is_running depends
        # on the state, so flipping the state to RESTARTING first would skip
        # the stop and leave the old child alive.
        if info.process is not None and not await self._stop_process(info):
            return False

        info.state = ProcessState.RESTARTING
        await asyncio.sleep(info.config.restart_delay)
        return await self._start_process(info)

    async def start_all(self) -> Dict[str, bool]:
        """Start every registered process sequentially; returns name -> success."""
        results = {}
        # Snapshot the names: the dict may change while we await.
        for name in list(self._processes):
            results[name] = await self.start_process(name)
        return results

    async def stop_all(self) -> None:
        """Stop every process that still has a child handle, concurrently."""
        # Keyed on the handle so children in FAILED state are stopped too.
        tasks = [self._stop_process(info) for info in self._processes.values() if info.process is not None]

        if tasks:
            await asyncio.gather(*tasks, return_exceptions=True)

    def get_process(self, name: str) -> Optional[ProcessInfo]:
        """Return the record for ``name`` or None."""
        return self._processes.get(name)

    def get_all_processes(self) -> Dict[str, ProcessInfo]:
        """Return a shallow copy of the name -> record mapping."""
        return self._processes.copy()

    def get_process_by_port(self, port: int) -> Optional[ProcessInfo]:
        """Return the first record whose current port equals ``port``, or None."""
        for info in self._processes.values():
            if info.port == port:
                return info
        return None

    def get_upstream_url(self, name: str) -> Optional[str]:
        """Return ``http://host:port`` for a running process, else None."""
        info = self._processes.get(name)
        if not info or not info.is_running:
            return None
        return f"http://{info.config.host}:{info.port}"

    async def _start_process(self, info: ProcessInfo) -> bool:
        """Spawn the child for ``info`` and wait until it answers its health path.

        Returns True on success. On failure the state becomes FAILED, any
        spawned child is killed and the port is handed back.
        """
        config = info.config

        # A stale handle (e.g. left after the restart budget ran out) must be
        # stopped first, otherwise overwriting it would orphan that child.
        if info.process is not None and not await self._stop_process(info):
            return False

        try:
            info.state = ProcessState.STARTING

            if info.port == 0:
                # An explicit config.port wins; 0 means "take one from the pool".
                info.port = config.port or await self._port_allocator.allocate()

            cmd = self._build_command(config, info.port)

            env = os.environ.copy()
            env.update(config.env)

            if config.module_path:
                python_path = env.get("PYTHONPATH", "")
                module_dir = str(Path(config.module_path).resolve())
                # os.pathsep keeps this correct on platforms where ":" is not the separator.
                env["PYTHONPATH"] = f"{module_dir}{os.pathsep}{python_path}" if python_path else module_dir

            # For WSGI apps, pass configuration via environment variables
            if config.app_type == "wsgi":
                env["PYSERVE_WSGI_APP"] = config.app_path
                env["PYSERVE_WSGI_FACTORY"] = "1" if config.factory else "0"

            logger.info(
                f"Starting process '{config.name}'",
                command=" ".join(cmd),
                port=info.port,
            )

            info.process = subprocess.Popen(
                cmd,
                env=env,
                # stdout is never read, so do not pipe it: a full pipe would
                # block the child.
                stdout=subprocess.DEVNULL,
                # NOTE(review): stderr is only read if the child dies during
                # startup; a long-running child that logs heavily to stderr can
                # still fill this pipe. Consider draining or redirecting it.
                stderr=subprocess.PIPE,
                # Own session/process group so killpg() reaches the workers too.
                start_new_session=hasattr(os, "setsid"),
            )

            info.pid = info.process.pid
            info.start_time = time.time()

            if not await self._wait_for_ready(info):
                raise RuntimeError(f"Process '{config.name}' failed to start")

            info.state = ProcessState.RUNNING
            logger.info(
                f"Process '{config.name}' started successfully",
                pid=info.pid,
                port=info.port,
            )
            return True

        except Exception as e:
            logger.error(f"Failed to start process '{config.name}': {e}")
            info.state = ProcessState.FAILED
            await self._reap_failed_start(info)
            if info.port:
                await self._port_allocator.release(info.port)
                info.port = 0
            return False

    async def _reap_failed_start(self, info: ProcessInfo) -> None:
        """Kill and forget a child that was spawned but never became ready."""
        proc = info.process
        if proc is None:
            return
        try:
            if proc.poll() is None:
                self._signal_process(proc, force=True)
                await asyncio.get_running_loop().run_in_executor(None, proc.wait)
            self._close_pipes(proc)
        except Exception as e:
            logger.error(f"Error stopping process '{info.config.name}': {e}")
        finally:
            info.process = None
            info.pid = None

    async def _stop_process(self, info: ProcessInfo) -> bool:
        """Terminate the child gracefully, escalating to a kill after ``shutdown_timeout``.

        On success the port is returned to the pool and ``info.port`` is reset
        to 0, so the next start obtains a port again instead of reusing one
        that may have been given to another process meanwhile.
        """
        proc = info.process
        if proc is None:
            info.state = ProcessState.STOPPED
            return True

        config = info.config
        info.state = ProcessState.STOPPING

        try:
            self._signal_process(proc, force=False)

            loop = asyncio.get_running_loop()
            try:
                await asyncio.wait_for(loop.run_in_executor(None, proc.wait), timeout=config.shutdown_timeout)
            except asyncio.TimeoutError:
                logger.warning(f"Process '{config.name}' did not stop gracefully, forcing kill")
                self._signal_process(proc, force=True)
                # Reap off the event loop thread.
                await loop.run_in_executor(None, proc.wait)

            self._close_pipes(proc)

            if info.port:
                await self._port_allocator.release(info.port)
                info.port = 0

            info.state = ProcessState.STOPPED
            info.process = None
            info.pid = None

            logger.info(f"Process '{config.name}' stopped")
            return True

        except Exception as e:
            logger.error(f"Error stopping process '{config.name}': {e}")
            info.state = ProcessState.FAILED
            return False

    @staticmethod
    def _signal_process(process: subprocess.Popen, force: bool) -> None:
        """Send SIGTERM (or SIGKILL when ``force``) to the child's process group.

        Where ``os.killpg`` is unavailable, fall back to ``terminate()`` / ``kill()``
        on the child itself. A process that is already gone is ignored.
        """
        if hasattr(os, "killpg"):
            sig = signal.SIGKILL if force else signal.SIGTERM
            try:
                os.killpg(os.getpgid(process.pid), sig)
            except ProcessLookupError:
                pass
        elif force:
            process.kill()
        else:
            process.terminate()

    @staticmethod
    def _close_pipes(process: subprocess.Popen) -> None:
        """Close whatever pipe ends the parent still holds for ``process``."""
        for stream in (process.stdout, process.stderr):
            if stream is not None and not stream.closed:
                stream.close()

    async def _wait_for_ready(self, info: ProcessInfo, timeout: float = 30.0) -> bool:
        """Poll the health path until it answers with a status below 500.

        Returns False if the child exits first or ``timeout`` seconds pass.
        """
        import httpx

        # Monotonic clock: immune to wall-clock adjustments.
        deadline = time.monotonic() + timeout
        url = f"http://{info.config.host}:{info.port}{info.config.health_check_path}"

        # One client for the whole wait, not one per attempt.
        async with httpx.AsyncClient(timeout=2.0) as client:
            while time.monotonic() < deadline:
                if info.process and info.process.poll() is not None:
                    # Child already exited, so communicate() returns immediately.
                    _, stderr = info.process.communicate()
                    logger.error(
                        f"Process '{info.config.name}' exited during startup",
                        returncode=info.process.returncode,
                        stderr=stderr.decode(errors="replace") if stderr else "",
                    )
                    return False

                try:
                    resp = await client.get(url)
                    if resp.status_code < 500:
                        return True
                except Exception:
                    # Expected while the server is still booting; keep polling.
                    pass

                await asyncio.sleep(0.5)

        return False

    async def _health_check_loop(self) -> None:
        """Background task: check every eligible process, then sleep until the next sweep."""
        while not self._shutdown_event.is_set():
            try:
                for info in list(self._processes.values()):
                    if not info.is_running or not info.config.health_check_enabled:
                        continue

                    await self._check_process_health(info)

                # default= covers "no processes" and "none with health checks
                # enabled"; a bare min() would raise ValueError in the latter.
                interval = min(
                    (p.config.health_check_interval for p in self._processes.values() if p.config.health_check_enabled),
                    default=10.0,
                )

                try:
                    await asyncio.wait_for(self._shutdown_event.wait(), timeout=interval)
                    break
                except asyncio.TimeoutError:
                    pass

            except Exception as e:
                logger.error(f"Error in health check loop: {e}")
                await asyncio.sleep(5)

    async def _check_process_health(self, info: ProcessInfo) -> bool:
        """Probe one process; count failures and trigger a restart at the retry limit."""
        import httpx

        config = info.config
        url = f"http://{config.host}:{info.port}{config.health_check_path}"

        try:
            async with httpx.AsyncClient(timeout=config.health_check_timeout) as client:
                resp = await client.get(url)
            if resp.status_code < 500:
                info.health_check_failures = 0
                info.last_health_check = time.time()
                return True
            failure = f"Health check returned status {resp.status_code}"
        except Exception as e:
            failure = str(e)

        info.health_check_failures += 1
        logger.warning(
            f"Health check failed for '{config.name}'",
            failures=info.health_check_failures,
            error=failure,
        )

        if info.health_check_failures >= config.health_check_retries:
            logger.error(f"Process '{config.name}' is unhealthy, restarting...")
            await self._handle_unhealthy_process(info)

        return False

    async def _handle_unhealthy_process(self, info: ProcessInfo) -> None:
        """Restart an unhealthy process with exponential backoff, capped at 60 s.

        Once ``max_restart_count`` restarts have been used the process is only
        marked FAILED.
        """
        config = info.config

        if info.restart_count >= config.max_restart_count:
            logger.error(f"Process '{config.name}' exceeded max restart count, marking as failed")
            info.state = ProcessState.FAILED
            return

        info.restart_count += 1
        info.health_check_failures = 0

        # restart_delay, doubled for each restart already performed.
        delay = config.restart_delay * (2 ** (info.restart_count - 1))
        delay = min(delay, 60.0)

        logger.info(
            f"Restarting process '{config.name}'",
            restart_count=info.restart_count,
            delay=delay,
        )

        await self._stop_process(info)
        await asyncio.sleep(delay)
        await self._start_process(info)

    def _build_command(self, config: ProcessConfig, port: int) -> List[str]:
        """Build the uvicorn command line for ``config`` listening on ``port``."""
        if config.app_type == "wsgi":
            wrapper_app = self._create_wsgi_wrapper_path(config)
            app_path = wrapper_app
        else:
            app_path = config.app_path

        cmd = [
            sys.executable,
            "-m",
            "uvicorn",
            app_path,
            "--host",
            config.host,
            "--port",
            str(port),
            "--workers",
            str(config.workers),
            "--log-level",
            "warning",
            "--no-access-log",
        ]

        # For WSGI the factory flag travels via PYSERVE_WSGI_FACTORY instead.
        if config.factory and config.app_type != "wsgi":
            cmd.append("--factory")

        return cmd

    def _create_wsgi_wrapper_path(self, config: ProcessConfig) -> str:
        """
        Since uvicorn can't directly run WSGI apps, we create a wrapper
        that imports the WSGI app and wraps it with a2wsgi.
        """
        # For WSGI apps, we use a special wrapper module: pyserve._wsgi_wrapper:app
        # The real app path reaches it through the PYSERVE_WSGI_APP environment
        # variable set in _start_process.
        return "pyserve._wsgi_wrapper:app"

    def get_metrics(self) -> Dict[str, Any]:
        """Return process counts plus a ``to_dict`` snapshot of every process."""
        return {
            "managed_processes": len(self._processes),
            "running_processes": sum(1 for p in self._processes.values() if p.is_running),
            "processes": {name: info.to_dict() for name, info in self._processes.items()},
        }
|
|
|
|
|
|
_process_manager: Optional[ProcessManager] = None
|
|
|
|
|
|
def get_process_manager() -> ProcessManager:
    """Return the module-wide manager, creating it with defaults on first use."""
    global _process_manager
    manager = _process_manager
    if manager is None:
        manager = _process_manager = ProcessManager()
    return manager
|
|
|
|
|
|
async def init_process_manager(
    port_range: tuple[int, int] = (9000, 9999),
    health_check_enabled: bool = True,
) -> ProcessManager:
    """Create a new module-wide manager with the given settings and start it.

    Any manager previously stored in the module global is replaced without
    being stopped.
    """
    global _process_manager
    manager = ProcessManager(port_range=port_range, health_check_enabled=health_check_enabled)
    _process_manager = manager
    await manager.start()
    return manager
|
|
|
|
|
|
async def shutdown_process_manager() -> None:
    """Stop the module-wide manager, if one exists, and clear the global."""
    global _process_manager
    if not _process_manager:
        return
    await _process_manager.stop()
    _process_manager = None
|