Codex/implement bounded retry policy for unity (#510)
* Add editor readiness v2, refresh tool, and preflight guards * Detect external package changes and harden refresh retry * feat: add TestRunnerNoThrottle and async test running with background stall prevention - Add TestRunnerNoThrottle.cs: Sets editor to 'No Throttling' mode during test runs with SessionState persistence across domain reload - Add run_tests_async and get_test_job tools for non-blocking test execution - Add TestJobManager for async test job tracking with progress monitoring - Add ForceSynchronousImport to all AssetDatabase.Refresh() calls to prevent stalls - Mark DomainReloadResilienceTests as [Explicit] with documentation explaining the test infrastructure limitation (internal coroutine waits vs MCP socket polling) - MCP workflow is unaffected - socket messages provide external stimulus that keeps Unity responsive even when backgrounded * refactor: simplify and clean up code - Remove unused Newtonsoft.Json.Linq import from TestJobManager - Add throttling to SessionState persistence (once per second) to reduce overhead - Critical job state changes (start/finish) still persist immediately - Fix duplicate XML summary tag in DomainReloadResilienceTests * docs: add async test tools to README, document domain reload limitation - Add run_tests_async and get_test_job to main README tools list - Document background stall limitation for domain reload tests in DEV readme * ci: add separate job for domain reload tests Run [Explicit] domain_reload tests in their own job using -testCategory * ci: run domain reload tests in same job as regular tests Combines into single job with two test steps to reuse cached Library * fix: address coderabbit review issues - Fix TOCTOU race in TestJobManager.StartJob (single lock scope for check-and-set) - Store TestRunnerApi reference with HideAndDontSave to prevent GC/serialization issues * docs: update tool descriptions to prefer run_tests_async - run_tests_async is now marked as preferred for long-running suites - run_tests description notes it blocks and suggests async alternative * docs: update README screenshot to v8.6 UI * docs: add v8.6 UI screenshot * docs: update v8.6 UI screenshot * docs: update v8.6 UI screenshot * docs: update v8.6 UI screenshot * Update README for MCP version and instructions for v8.7 * fix: handle preflight busy signals and derive job status from test results - manage_asset, manage_gameobject, manage_scene now check preflight return value and propagate busy/retry signals to clients (fixes Sourcery #1) - TestJobManager.FinalizeCurrentJobFromRunFinished now sets job status to Failed when resultPayload.Failed > 0, not always Succeeded (fixes Sourcery #2) * fix: increase HTTP server startup timeout for dev mode When 'Force fresh server install' is enabled, uvx uses --no-cache --refresh which rebuilds the package and takes significantly longer to start. - Increase timeout from 10s to 45s when dev mode is enabled - Add informative log message explaining the longer startup time - Show actual timeout value in warning message * fix: derive job status from test results in FinalizeFromTask fallback Apply same logic as FinalizeCurrentJobFromRunFinished: check result.Failed > 0 to correctly mark jobs as Failed when tests fail, even in the fallback path when RunFinished callback is not delivered. * Bound Unity reload/session waits * refactor: improve env var parsing and reason extraction Address code review feedback: - Catch ValueError specifically (instead of broad Exception) when parsing UNITY_MCP_RELOAD_MAX_WAIT_S, UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S, and UNITY_MCP_SESSION_READY_WAIT_SECONDS, with logging for easier diagnosis of misconfiguration - Normalize reason values to lowercase in _extract_response_reason() to avoid case-sensitive mismatches in comparisons - Simplify refresh_unity.py by removing redundant isinstance check and reusing _extract_response_reason instead of duplicating reason parsing * Add upper bound clamp and custom exception for bounded retry - Add upper bound (30s) to UNITY_MCP_RELOAD_MAX_WAIT_S to prevent misconfiguration from causing excessive waits - Add upper bound (30s) to UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S for consistency with readiness probe - Introduce NoUnitySessionError custom exception to replace fragile string matching in send_command_for_instance Addresses code review feedback for bounded retry policy PR.main
parent
6bbf137685
commit
96b81caf26
|
|
@ -10,7 +10,7 @@ from models import MCPResponse
|
||||||
from services.registry import mcp_for_unity_tool
|
from services.registry import mcp_for_unity_tool
|
||||||
from services.tools import get_unity_instance_from_context
|
from services.tools import get_unity_instance_from_context
|
||||||
import transport.unity_transport as unity_transport
|
import transport.unity_transport as unity_transport
|
||||||
from transport.legacy.unity_connection import async_send_command_with_retry
|
from transport.legacy.unity_connection import async_send_command_with_retry, _extract_response_reason
|
||||||
from services.state.external_changes_scanner import external_changes_scanner
|
from services.state.external_changes_scanner import external_changes_scanner
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -47,10 +47,12 @@ async def refresh_unity(
|
||||||
if isinstance(response, dict) and not response.get("success", True):
|
if isinstance(response, dict) and not response.get("success", True):
|
||||||
hint = response.get("hint")
|
hint = response.get("hint")
|
||||||
err = (response.get("error") or response.get("message") or "")
|
err = (response.get("error") or response.get("message") or "")
|
||||||
|
reason = _extract_response_reason(response)
|
||||||
is_retryable = (hint == "retry") or ("disconnected" in str(err).lower())
|
is_retryable = (hint == "retry") or ("disconnected" in str(err).lower())
|
||||||
if (not wait_for_ready) or (not is_retryable):
|
if (not wait_for_ready) or (not is_retryable):
|
||||||
return MCPResponse(**response)
|
return MCPResponse(**response)
|
||||||
recovered_from_disconnect = True
|
if reason not in {"reloading", "no_unity_session"}:
|
||||||
|
recovered_from_disconnect = True
|
||||||
|
|
||||||
# Optional server-side wait loop (defensive): if Unity tool doesn't wait or returns quickly,
|
# Optional server-side wait loop (defensive): if Unity tool doesn't wait or returns quickly,
|
||||||
# poll the canonical editor_state v2 resource until ready or timeout.
|
# poll the canonical editor_state v2 resource until ready or timeout.
|
||||||
|
|
@ -86,5 +88,3 @@ async def refresh_unity(
|
||||||
)
|
)
|
||||||
|
|
||||||
return MCPResponse(**response) if isinstance(response, dict) else response
|
return MCPResponse(**response) if isinstance(response, dict) else response
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -686,28 +686,46 @@ def get_unity_connection(instance_identifier: str | None = None) -> UnityConnect
|
||||||
# Centralized retry helpers
|
# Centralized retry helpers
|
||||||
# -----------------------------
|
# -----------------------------
|
||||||
|
|
||||||
|
def _extract_response_reason(resp: object) -> str | None:
|
||||||
|
"""Extract a normalized (lowercase) reason string from a response.
|
||||||
|
|
||||||
|
Returns lowercase reason values to enable case-insensitive comparisons
|
||||||
|
by callers (e.g. _is_reloading_response, refresh_unity).
|
||||||
|
"""
|
||||||
|
if isinstance(resp, MCPResponse):
|
||||||
|
data = getattr(resp, "data", None)
|
||||||
|
if isinstance(data, dict):
|
||||||
|
reason = data.get("reason")
|
||||||
|
if isinstance(reason, str):
|
||||||
|
return reason.lower()
|
||||||
|
message_text = f"{resp.message or ''} {resp.error or ''}".lower()
|
||||||
|
if "reload" in message_text:
|
||||||
|
return "reloading"
|
||||||
|
return None
|
||||||
|
|
||||||
|
if isinstance(resp, dict):
|
||||||
|
if resp.get("state") == "reloading":
|
||||||
|
return "reloading"
|
||||||
|
data = resp.get("data")
|
||||||
|
if isinstance(data, dict):
|
||||||
|
reason = data.get("reason")
|
||||||
|
if isinstance(reason, str):
|
||||||
|
return reason.lower()
|
||||||
|
message_text = (resp.get("message") or resp.get("error") or "").lower()
|
||||||
|
if "reload" in message_text:
|
||||||
|
return "reloading"
|
||||||
|
return None
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _is_reloading_response(resp: object) -> bool:
|
def _is_reloading_response(resp: object) -> bool:
|
||||||
"""Return True if the Unity response indicates the editor is reloading.
|
"""Return True if the Unity response indicates the editor is reloading.
|
||||||
|
|
||||||
Supports both raw dict payloads from Unity and MCPResponse objects returned
|
Supports both raw dict payloads from Unity and MCPResponse objects returned
|
||||||
by preflight checks or transport helpers.
|
by preflight checks or transport helpers.
|
||||||
"""
|
"""
|
||||||
# Structured MCPResponse from preflight/transport
|
return _extract_response_reason(resp) == "reloading"
|
||||||
if isinstance(resp, MCPResponse):
|
|
||||||
# Explicit "please retry" hint from preflight
|
|
||||||
if getattr(resp, "hint", None) == "retry":
|
|
||||||
return True
|
|
||||||
message_text = f"{resp.message or ''} {resp.error or ''}".lower()
|
|
||||||
return "reload" in message_text
|
|
||||||
|
|
||||||
# Raw Unity payloads
|
|
||||||
if isinstance(resp, dict):
|
|
||||||
if resp.get("state") == "reloading":
|
|
||||||
return True
|
|
||||||
message_text = (resp.get("message") or resp.get("error") or "").lower()
|
|
||||||
return "reload" in message_text
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def send_command_with_retry(
|
def send_command_with_retry(
|
||||||
|
|
@ -738,15 +756,82 @@ def send_command_with_retry(
|
||||||
max_retries = getattr(config, "reload_max_retries", 40)
|
max_retries = getattr(config, "reload_max_retries", 40)
|
||||||
if retry_ms is None:
|
if retry_ms is None:
|
||||||
retry_ms = getattr(config, "reload_retry_ms", 250)
|
retry_ms = getattr(config, "reload_retry_ms", 250)
|
||||||
|
try:
|
||||||
|
max_wait_s = float(os.environ.get(
|
||||||
|
"UNITY_MCP_RELOAD_MAX_WAIT_S", "2.0"))
|
||||||
|
except ValueError as e:
|
||||||
|
raw_val = os.environ.get("UNITY_MCP_RELOAD_MAX_WAIT_S", "2.0")
|
||||||
|
logger.warning(
|
||||||
|
"Invalid UNITY_MCP_RELOAD_MAX_WAIT_S=%r, using default 2.0: %s",
|
||||||
|
raw_val, e)
|
||||||
|
max_wait_s = 2.0
|
||||||
|
# Clamp to [0, 30] to prevent misconfiguration from causing excessive waits
|
||||||
|
max_wait_s = max(0.0, min(max_wait_s, 30.0))
|
||||||
|
|
||||||
response = conn.send_command(command_type, params)
|
response = conn.send_command(command_type, params)
|
||||||
retries = 0
|
retries = 0
|
||||||
|
wait_started = None
|
||||||
|
reason = _extract_response_reason(response)
|
||||||
while _is_reloading_response(response) and retries < max_retries:
|
while _is_reloading_response(response) and retries < max_retries:
|
||||||
delay_ms = int(response.get("retry_after_ms", retry_ms)
|
if wait_started is None:
|
||||||
) if isinstance(response, dict) else retry_ms
|
wait_started = time.monotonic()
|
||||||
time.sleep(max(0.0, delay_ms / 1000.0))
|
logger.debug(
|
||||||
|
"Unity reload wait started: command=%s instance=%s reason=%s max_wait_s=%.2f",
|
||||||
|
command_type,
|
||||||
|
instance_id or "default",
|
||||||
|
reason or "reloading",
|
||||||
|
max_wait_s,
|
||||||
|
)
|
||||||
|
if max_wait_s <= 0:
|
||||||
|
break
|
||||||
|
elapsed = time.monotonic() - wait_started
|
||||||
|
if elapsed >= max_wait_s:
|
||||||
|
break
|
||||||
|
delay_ms = retry_ms
|
||||||
|
if isinstance(response, dict):
|
||||||
|
retry_after = response.get("retry_after_ms")
|
||||||
|
if retry_after is None and isinstance(response.get("data"), dict):
|
||||||
|
retry_after = response["data"].get("retry_after_ms")
|
||||||
|
if retry_after is not None:
|
||||||
|
delay_ms = int(retry_after)
|
||||||
|
sleep_ms = max(50, min(int(delay_ms), 250))
|
||||||
|
logger.debug(
|
||||||
|
"Unity reload wait retry: command=%s instance=%s reason=%s retry_after_ms=%s sleep_ms=%s",
|
||||||
|
command_type,
|
||||||
|
instance_id or "default",
|
||||||
|
reason or "reloading",
|
||||||
|
delay_ms,
|
||||||
|
sleep_ms,
|
||||||
|
)
|
||||||
|
time.sleep(max(0.0, sleep_ms / 1000.0))
|
||||||
retries += 1
|
retries += 1
|
||||||
response = conn.send_command(command_type, params)
|
response = conn.send_command(command_type, params)
|
||||||
|
reason = _extract_response_reason(response)
|
||||||
|
|
||||||
|
if wait_started is not None:
|
||||||
|
waited = time.monotonic() - wait_started
|
||||||
|
if _is_reloading_response(response):
|
||||||
|
logger.debug(
|
||||||
|
"Unity reload wait exceeded budget: command=%s instance=%s waited_s=%.3f",
|
||||||
|
command_type,
|
||||||
|
instance_id or "default",
|
||||||
|
waited,
|
||||||
|
)
|
||||||
|
return MCPResponse(
|
||||||
|
success=False,
|
||||||
|
error="Unity is reloading; please retry",
|
||||||
|
hint="retry",
|
||||||
|
data={
|
||||||
|
"reason": "reloading",
|
||||||
|
"retry_after_ms": min(250, max(50, retry_ms)),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
logger.debug(
|
||||||
|
"Unity reload wait completed: command=%s instance=%s waited_s=%.3f",
|
||||||
|
command_type,
|
||||||
|
instance_id or "default",
|
||||||
|
waited,
|
||||||
|
)
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -34,6 +34,10 @@ class PluginDisconnectedError(RuntimeError):
|
||||||
"""Raised when a plugin WebSocket disconnects while commands are in flight."""
|
"""Raised when a plugin WebSocket disconnects while commands are in flight."""
|
||||||
|
|
||||||
|
|
||||||
|
class NoUnitySessionError(RuntimeError):
|
||||||
|
"""Raised when no Unity plugins are available."""
|
||||||
|
|
||||||
|
|
||||||
class PluginHub(WebSocketEndpoint):
|
class PluginHub(WebSocketEndpoint):
|
||||||
"""Manages persistent WebSocket connections to Unity plugins."""
|
"""Manages persistent WebSocket connections to Unity plugins."""
|
||||||
|
|
||||||
|
|
@ -361,11 +365,20 @@ class PluginHub(WebSocketEndpoint):
|
||||||
if cls._registry is None:
|
if cls._registry is None:
|
||||||
raise RuntimeError("Plugin registry not configured")
|
raise RuntimeError("Plugin registry not configured")
|
||||||
|
|
||||||
# Use the same defaults as the stdio transport reload handling so that
|
# Bound waiting for Unity sessions so calls fail fast when editors are not ready.
|
||||||
# HTTP/WebSocket and TCP behave consistently without per-project env.
|
try:
|
||||||
max_retries = max(1, int(getattr(config, "reload_max_retries", 40)))
|
max_wait_s = float(
|
||||||
|
os.environ.get("UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S", "2.0"))
|
||||||
|
except ValueError as e:
|
||||||
|
raw_val = os.environ.get("UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S", "2.0")
|
||||||
|
logger.warning(
|
||||||
|
"Invalid UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S=%r, using default 2.0: %s",
|
||||||
|
raw_val, e)
|
||||||
|
max_wait_s = 2.0
|
||||||
|
# Clamp to [0, 30] to prevent misconfiguration from causing excessive waits
|
||||||
|
max_wait_s = max(0.0, min(max_wait_s, 30.0))
|
||||||
retry_ms = float(getattr(config, "reload_retry_ms", 250))
|
retry_ms = float(getattr(config, "reload_retry_ms", 250))
|
||||||
sleep_seconds = max(0.05, retry_ms / 1000.0)
|
sleep_seconds = max(0.05, min(0.25, retry_ms / 1000.0))
|
||||||
|
|
||||||
# Allow callers to provide either just the hash or Name@hash
|
# Allow callers to provide either just the hash or Name@hash
|
||||||
target_hash: str | None = None
|
target_hash: str | None = None
|
||||||
|
|
@ -394,7 +407,7 @@ class PluginHub(WebSocketEndpoint):
|
||||||
return None, count
|
return None, count
|
||||||
|
|
||||||
session_id, session_count = await _try_once()
|
session_id, session_count = await _try_once()
|
||||||
deadline = time.monotonic() + (max_retries * sleep_seconds)
|
deadline = time.monotonic() + max_wait_s
|
||||||
wait_started = None
|
wait_started = None
|
||||||
|
|
||||||
# If there is no active plugin yet (e.g., Unity starting up or reloading),
|
# If there is no active plugin yet (e.g., Unity starting up or reloading),
|
||||||
|
|
@ -408,14 +421,18 @@ class PluginHub(WebSocketEndpoint):
|
||||||
if wait_started is None:
|
if wait_started is None:
|
||||||
wait_started = time.monotonic()
|
wait_started = time.monotonic()
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"No plugin session available (instance={unity_instance or 'default'}); waiting up to {deadline - wait_started:.2f}s",
|
"No plugin session available (instance=%s); waiting up to %.2fs",
|
||||||
|
unity_instance or "default",
|
||||||
|
max_wait_s,
|
||||||
)
|
)
|
||||||
await asyncio.sleep(sleep_seconds)
|
await asyncio.sleep(sleep_seconds)
|
||||||
session_id, session_count = await _try_once()
|
session_id, session_count = await _try_once()
|
||||||
|
|
||||||
if session_id is not None and wait_started is not None:
|
if session_id is not None and wait_started is not None:
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"Plugin session restored after {time.monotonic() - wait_started:.3f}s (instance={unity_instance or 'default'})",
|
"Plugin session restored after %.3fs (instance=%s)",
|
||||||
|
time.monotonic() - wait_started,
|
||||||
|
unity_instance or "default",
|
||||||
)
|
)
|
||||||
if session_id is None and not target_hash and session_count > 1:
|
if session_id is None and not target_hash and session_count > 1:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
|
|
@ -425,11 +442,13 @@ class PluginHub(WebSocketEndpoint):
|
||||||
|
|
||||||
if session_id is None:
|
if session_id is None:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"No Unity plugin reconnected within {max_retries * sleep_seconds:.2f}s (instance={unity_instance or 'default'})",
|
"No Unity plugin reconnected within %.2fs (instance=%s)",
|
||||||
|
max_wait_s,
|
||||||
|
unity_instance or "default",
|
||||||
)
|
)
|
||||||
# At this point we've given the plugin ample time to reconnect; surface
|
# At this point we've given the plugin ample time to reconnect; surface
|
||||||
# a clear error so the client can prompt the user to open Unity.
|
# a clear error so the client can prompt the user to open Unity.
|
||||||
raise RuntimeError("No Unity plugins are currently connected")
|
raise NoUnitySessionError("No Unity plugins are currently connected")
|
||||||
|
|
||||||
return session_id
|
return session_id
|
||||||
|
|
||||||
|
|
@ -440,7 +459,20 @@ class PluginHub(WebSocketEndpoint):
|
||||||
command_type: str,
|
command_type: str,
|
||||||
params: dict[str, Any],
|
params: dict[str, Any],
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
session_id = await cls._resolve_session_id(unity_instance)
|
try:
|
||||||
|
session_id = await cls._resolve_session_id(unity_instance)
|
||||||
|
except NoUnitySessionError:
|
||||||
|
logger.debug(
|
||||||
|
"Unity session unavailable; returning retry: command=%s instance=%s",
|
||||||
|
command_type,
|
||||||
|
unity_instance or "default",
|
||||||
|
)
|
||||||
|
return MCPResponse(
|
||||||
|
success=False,
|
||||||
|
error="Unity session not available; please retry",
|
||||||
|
hint="retry",
|
||||||
|
data={"reason": "no_unity_session", "retry_after_ms": 250},
|
||||||
|
).model_dump()
|
||||||
|
|
||||||
# During domain reload / immediate reconnect windows, the plugin may be connected but not yet
|
# During domain reload / immediate reconnect windows, the plugin may be connected but not yet
|
||||||
# ready to process execute commands on the Unity main thread (which can be further delayed when
|
# ready to process execute commands on the Unity main thread (which can be further delayed when
|
||||||
|
|
@ -450,7 +482,11 @@ class PluginHub(WebSocketEndpoint):
|
||||||
if command_type in cls._FAST_FAIL_COMMANDS and command_type != "ping":
|
if command_type in cls._FAST_FAIL_COMMANDS and command_type != "ping":
|
||||||
try:
|
try:
|
||||||
max_wait_s = float(os.environ.get("UNITY_MCP_SESSION_READY_WAIT_SECONDS", "6"))
|
max_wait_s = float(os.environ.get("UNITY_MCP_SESSION_READY_WAIT_SECONDS", "6"))
|
||||||
except Exception:
|
except ValueError as e:
|
||||||
|
raw_val = os.environ.get("UNITY_MCP_SESSION_READY_WAIT_SECONDS", "6")
|
||||||
|
logger.warning(
|
||||||
|
"Invalid UNITY_MCP_SESSION_READY_WAIT_SECONDS=%r, using default 6.0: %s",
|
||||||
|
raw_val, e)
|
||||||
max_wait_s = 6.0
|
max_wait_s = 6.0
|
||||||
max_wait_s = max(0.0, min(max_wait_s, 30.0))
|
max_wait_s = max(0.0, min(max_wait_s, 30.0))
|
||||||
if max_wait_s > 0:
|
if max_wait_s > 0:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue