From 96b81caf261f0e7ce962ead19fadec8cd575cb35 Mon Sep 17 00:00:00 2001 From: dsarno Date: Sun, 4 Jan 2026 14:45:05 -0800 Subject: [PATCH] Codex/implement bounded retry policy for unity (#510) * Add editor readiness v2, refresh tool, and preflight guards * Detect external package changes and harden refresh retry * feat: add TestRunnerNoThrottle and async test running with background stall prevention - Add TestRunnerNoThrottle.cs: Sets editor to 'No Throttling' mode during test runs with SessionState persistence across domain reload - Add run_tests_async and get_test_job tools for non-blocking test execution - Add TestJobManager for async test job tracking with progress monitoring - Add ForceSynchronousImport to all AssetDatabase.Refresh() calls to prevent stalls - Mark DomainReloadResilienceTests as [Explicit] with documentation explaining the test infrastructure limitation (internal coroutine waits vs MCP socket polling) - MCP workflow is unaffected - socket messages provide external stimulus that keeps Unity responsive even when backgrounded * refactor: simplify and clean up code - Remove unused Newtonsoft.Json.Linq import from TestJobManager - Add throttling to SessionState persistence (once per second) to reduce overhead - Critical job state changes (start/finish) still persist immediately - Fix duplicate XML summary tag in DomainReloadResilienceTests * docs: add async test tools to README, document domain reload limitation - Add run_tests_async and get_test_job to main README tools list - Document background stall limitation for domain reload tests in DEV readme * ci: add separate job for domain reload tests Run [Explicit] domain_reload tests in their own job using -testCategory * ci: run domain reload tests in same job as regular tests Combines into single job with two test steps to reuse cached Library * fix: address coderabbit review issues - Fix TOCTOU race in TestJobManager.StartJob (single lock scope for check-and-set) - Store TestRunnerApi reference with HideAndDontSave to prevent GC/serialization issues * docs: update tool descriptions to prefer run_tests_async - run_tests_async is now marked as preferred for long-running suites - run_tests description notes it blocks and suggests async alternative * docs: update README screenshot to v8.6 UI * docs: add v8.6 UI screenshot * docs: update v8.6 UI screenshot * docs: update v8.6 UI screenshot * docs: update v8.6 UI screenshot * Update README for MCP version and instructions for v8.7 * fix: handle preflight busy signals and derive job status from test results - manage_asset, manage_gameobject, manage_scene now check preflight return value and propagate busy/retry signals to clients (fixes Sourcery #1) - TestJobManager.FinalizeCurrentJobFromRunFinished now sets job status to Failed when resultPayload.Failed > 0, not always Succeeded (fixes Sourcery #2) * fix: increase HTTP server startup timeout for dev mode When 'Force fresh server install' is enabled, uvx uses --no-cache --refresh which rebuilds the package and takes significantly longer to start. - Increase timeout from 10s to 45s when dev mode is enabled - Add informative log message explaining the longer startup time - Show actual timeout value in warning message * fix: derive job status from test results in FinalizeFromTask fallback Apply same logic as FinalizeCurrentJobFromRunFinished: check result.Failed > 0 to correctly mark jobs as Failed when tests fail, even in the fallback path when RunFinished callback is not delivered. * Bound Unity reload/session waits * refactor: improve env var parsing and reason extraction Address code review feedback: - Catch ValueError specifically (instead of broad Exception) when parsing UNITY_MCP_RELOAD_MAX_WAIT_S, UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S, and UNITY_MCP_SESSION_READY_WAIT_SECONDS, with logging for easier diagnosis of misconfiguration - Normalize reason values to lowercase in _extract_response_reason() to avoid case-sensitive mismatches in comparisons - Simplify refresh_unity.py by removing redundant isinstance check and reusing _extract_response_reason instead of duplicating reason parsing * Add upper bound clamp and custom exception for bounded retry - Add upper bound (30s) to UNITY_MCP_RELOAD_MAX_WAIT_S to prevent misconfiguration from causing excessive waits - Add upper bound (30s) to UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S for consistency with readiness probe - Introduce NoUnitySessionError custom exception to replace fragile string matching in send_command_for_instance Addresses code review feedback for bounded retry policy PR. --- Server/src/services/tools/refresh_unity.py | 8 +- .../src/transport/legacy/unity_connection.py | 123 +++++++++++++++--- Server/src/transport/plugin_hub.py | 58 +++++++-- 3 files changed, 155 insertions(+), 34 deletions(-) diff --git a/Server/src/services/tools/refresh_unity.py b/Server/src/services/tools/refresh_unity.py index 47dc41e..795dcfa 100644 --- a/Server/src/services/tools/refresh_unity.py +++ b/Server/src/services/tools/refresh_unity.py @@ -10,7 +10,7 @@ from models import MCPResponse from services.registry import mcp_for_unity_tool from services.tools import get_unity_instance_from_context import transport.unity_transport as unity_transport -from transport.legacy.unity_connection import async_send_command_with_retry +from transport.legacy.unity_connection import async_send_command_with_retry, _extract_response_reason from services.state.external_changes_scanner import external_changes_scanner @@ -47,10 +47,12 @@ async def refresh_unity( if isinstance(response, dict) and not response.get("success", True): hint = response.get("hint") err = (response.get("error") or response.get("message") or "") + reason = _extract_response_reason(response) is_retryable = (hint == "retry") or ("disconnected" in str(err).lower()) if (not wait_for_ready) or (not is_retryable): return MCPResponse(**response) - recovered_from_disconnect = True + if reason not in {"reloading", "no_unity_session"}: + recovered_from_disconnect = True # Optional server-side wait loop (defensive): if Unity tool doesn't wait or returns quickly, # poll the canonical editor_state v2 resource until ready or timeout. @@ -86,5 +88,3 @@ async def refresh_unity( ) return MCPResponse(**response) if isinstance(response, dict) else response - - diff --git a/Server/src/transport/legacy/unity_connection.py b/Server/src/transport/legacy/unity_connection.py index f6c3455..9a501db 100644 --- a/Server/src/transport/legacy/unity_connection.py +++ b/Server/src/transport/legacy/unity_connection.py @@ -686,28 +686,46 @@ def get_unity_connection(instance_identifier: str | None = None) -> UnityConnect # Centralized retry helpers # ----------------------------- +def _extract_response_reason(resp: object) -> str | None: + """Extract a normalized (lowercase) reason string from a response. + + Returns lowercase reason values to enable case-insensitive comparisons + by callers (e.g. _is_reloading_response, refresh_unity). + """ + if isinstance(resp, MCPResponse): + data = getattr(resp, "data", None) + if isinstance(data, dict): + reason = data.get("reason") + if isinstance(reason, str): + return reason.lower() + message_text = f"{resp.message or ''} {resp.error or ''}".lower() + if "reload" in message_text: + return "reloading" + return None + + if isinstance(resp, dict): + if resp.get("state") == "reloading": + return "reloading" + data = resp.get("data") + if isinstance(data, dict): + reason = data.get("reason") + if isinstance(reason, str): + return reason.lower() + message_text = (resp.get("message") or resp.get("error") or "").lower() + if "reload" in message_text: + return "reloading" + return None + + return None + + def _is_reloading_response(resp: object) -> bool: """Return True if the Unity response indicates the editor is reloading. Supports both raw dict payloads from Unity and MCPResponse objects returned by preflight checks or transport helpers. """ - # Structured MCPResponse from preflight/transport - if isinstance(resp, MCPResponse): - # Explicit "please retry" hint from preflight - if getattr(resp, "hint", None) == "retry": - return True - message_text = f"{resp.message or ''} {resp.error or ''}".lower() - return "reload" in message_text - - # Raw Unity payloads - if isinstance(resp, dict): - if resp.get("state") == "reloading": - return True - message_text = (resp.get("message") or resp.get("error") or "").lower() - return "reload" in message_text - - return False + return _extract_response_reason(resp) == "reloading" def send_command_with_retry( @@ -738,15 +756,82 @@ def send_command_with_retry( max_retries = getattr(config, "reload_max_retries", 40) if retry_ms is None: retry_ms = getattr(config, "reload_retry_ms", 250) + try: + max_wait_s = float(os.environ.get( + "UNITY_MCP_RELOAD_MAX_WAIT_S", "2.0")) + except ValueError as e: + raw_val = os.environ.get("UNITY_MCP_RELOAD_MAX_WAIT_S", "2.0") + logger.warning( + "Invalid UNITY_MCP_RELOAD_MAX_WAIT_S=%r, using default 2.0: %s", + raw_val, e) + max_wait_s = 2.0 + # Clamp to [0, 30] to prevent misconfiguration from causing excessive waits + max_wait_s = max(0.0, min(max_wait_s, 30.0)) response = conn.send_command(command_type, params) retries = 0 + wait_started = None + reason = _extract_response_reason(response) while _is_reloading_response(response) and retries < max_retries: - delay_ms = int(response.get("retry_after_ms", retry_ms) - ) if isinstance(response, dict) else retry_ms - time.sleep(max(0.0, delay_ms / 1000.0)) + if wait_started is None: + wait_started = time.monotonic() + logger.debug( + "Unity reload wait started: command=%s instance=%s reason=%s max_wait_s=%.2f", + command_type, + instance_id or "default", + reason or "reloading", + max_wait_s, + ) + if max_wait_s <= 0: + break + elapsed = time.monotonic() - wait_started + if elapsed >= max_wait_s: + break + delay_ms = retry_ms + if isinstance(response, dict): + retry_after = response.get("retry_after_ms") + if retry_after is None and isinstance(response.get("data"), dict): + retry_after = response["data"].get("retry_after_ms") + if retry_after is not None: + delay_ms = int(retry_after) + sleep_ms = max(50, min(int(delay_ms), 250)) + logger.debug( + "Unity reload wait retry: command=%s instance=%s reason=%s retry_after_ms=%s sleep_ms=%s", + command_type, + instance_id or "default", + reason or "reloading", + delay_ms, + sleep_ms, + ) + time.sleep(max(0.0, sleep_ms / 1000.0)) retries += 1 response = conn.send_command(command_type, params) + reason = _extract_response_reason(response) + + if wait_started is not None: + waited = time.monotonic() - wait_started + if _is_reloading_response(response): + logger.debug( + "Unity reload wait exceeded budget: command=%s instance=%s waited_s=%.3f", + command_type, + instance_id or "default", + waited, + ) + return MCPResponse( + success=False, + error="Unity is reloading; please retry", + hint="retry", + data={ + "reason": "reloading", + "retry_after_ms": min(250, max(50, retry_ms)), + }, + ) + logger.debug( + "Unity reload wait completed: command=%s instance=%s waited_s=%.3f", + command_type, + instance_id or "default", + waited, + ) return response diff --git a/Server/src/transport/plugin_hub.py b/Server/src/transport/plugin_hub.py index 09ec73f..86cdea8 100644 --- a/Server/src/transport/plugin_hub.py +++ b/Server/src/transport/plugin_hub.py @@ -34,6 +34,10 @@ class PluginDisconnectedError(RuntimeError): """Raised when a plugin WebSocket disconnects while commands are in flight.""" +class NoUnitySessionError(RuntimeError): + """Raised when no Unity plugins are available.""" + + class PluginHub(WebSocketEndpoint): """Manages persistent WebSocket connections to Unity plugins.""" @@ -361,11 +365,20 @@ class PluginHub(WebSocketEndpoint): if cls._registry is None: raise RuntimeError("Plugin registry not configured") - # Use the same defaults as the stdio transport reload handling so that - # HTTP/WebSocket and TCP behave consistently without per-project env. - max_retries = max(1, int(getattr(config, "reload_max_retries", 40))) + # Bound waiting for Unity sessions so calls fail fast when editors are not ready. + try: + max_wait_s = float( + os.environ.get("UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S", "2.0")) + except ValueError as e: + raw_val = os.environ.get("UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S", "2.0") + logger.warning( + "Invalid UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S=%r, using default 2.0: %s", + raw_val, e) + max_wait_s = 2.0 + # Clamp to [0, 30] to prevent misconfiguration from causing excessive waits + max_wait_s = max(0.0, min(max_wait_s, 30.0)) retry_ms = float(getattr(config, "reload_retry_ms", 250)) - sleep_seconds = max(0.05, retry_ms / 1000.0) + sleep_seconds = max(0.05, min(0.25, retry_ms / 1000.0)) # Allow callers to provide either just the hash or Name@hash target_hash: str | None = None @@ -394,7 +407,7 @@ class PluginHub(WebSocketEndpoint): return None, count session_id, session_count = await _try_once() - deadline = time.monotonic() + (max_retries * sleep_seconds) + deadline = time.monotonic() + max_wait_s wait_started = None # If there is no active plugin yet (e.g., Unity starting up or reloading), @@ -408,14 +421,18 @@ class PluginHub(WebSocketEndpoint): if wait_started is None: wait_started = time.monotonic() logger.debug( - f"No plugin session available (instance={unity_instance or 'default'}); waiting up to {deadline - wait_started:.2f}s", + "No plugin session available (instance=%s); waiting up to %.2fs", + unity_instance or "default", + max_wait_s, ) await asyncio.sleep(sleep_seconds) session_id, session_count = await _try_once() if session_id is not None and wait_started is not None: logger.debug( - f"Plugin session restored after {time.monotonic() - wait_started:.3f}s (instance={unity_instance or 'default'})", + "Plugin session restored after %.3fs (instance=%s)", + time.monotonic() - wait_started, + unity_instance or "default", ) if session_id is None and not target_hash and session_count > 1: raise RuntimeError( @@ -425,11 +442,13 @@ class PluginHub(WebSocketEndpoint): if session_id is None: logger.warning( - f"No Unity plugin reconnected within {max_retries * sleep_seconds:.2f}s (instance={unity_instance or 'default'})", + "No Unity plugin reconnected within %.2fs (instance=%s)", + max_wait_s, + unity_instance or "default", ) # At this point we've given the plugin ample time to reconnect; surface # a clear error so the client can prompt the user to open Unity. - raise RuntimeError("No Unity plugins are currently connected") + raise NoUnitySessionError("No Unity plugins are currently connected") return session_id @@ -440,7 +459,20 @@ class PluginHub(WebSocketEndpoint): command_type: str, params: dict[str, Any], ) -> dict[str, Any]: - session_id = await cls._resolve_session_id(unity_instance) + try: + session_id = await cls._resolve_session_id(unity_instance) + except NoUnitySessionError: + logger.debug( + "Unity session unavailable; returning retry: command=%s instance=%s", + command_type, + unity_instance or "default", + ) + return MCPResponse( + success=False, + error="Unity session not available; please retry", + hint="retry", + data={"reason": "no_unity_session", "retry_after_ms": 250}, + ).model_dump() # During domain reload / immediate reconnect windows, the plugin may be connected but not yet # ready to process execute commands on the Unity main thread (which can be further delayed when @@ -450,7 +482,11 @@ class PluginHub(WebSocketEndpoint): if command_type in cls._FAST_FAIL_COMMANDS and command_type != "ping": try: max_wait_s = float(os.environ.get("UNITY_MCP_SESSION_READY_WAIT_SECONDS", "6")) - except Exception: + except ValueError as e: + raw_val = os.environ.get("UNITY_MCP_SESSION_READY_WAIT_SECONDS", "6") + logger.warning( + "Invalid UNITY_MCP_SESSION_READY_WAIT_SECONDS=%r, using default 6.0: %s", + raw_val, e) max_wait_s = 6.0 max_wait_s = max(0.0, min(max_wait_s, 30.0)) if max_wait_s > 0: