Fix/ci cleanup (#484)

* CI: tighten NL/T reporting and fail job on missing/failed tests
* CI: include test titles in summary checklist
2025-12-24 13:12:26 -08:00 · 2025-12-24 13:12:26 -08:00 · 8e3bd1d70c
parent 91b6f4d8d6
commit 8e3bd1d70c
2 changed files with 111 additions and 7 deletions
--- a/.claude/prompts/nl-unity-suite-nl.md
+++ b/.claude/prompts/nl-unity-suite-nl.md
@@ -18,15 +18,15 @@ AllowedTools: Write,mcp__unity__manage_editor,mcp__unity__list_resources,mcp__un
 - Each file must contain EXACTLY one `<testcase>` root element
 - NO prologue, epilogue, code fences, or extra characters
 - NO markdown formatting or explanations outside the XML
-- Use this exact format:
+- Use this exact shape (write the XML directly into the file; do not wrap it in ``` fences):

-```xml
 <testcase name="NL-0 — Baseline State Capture" classname="UnityMCP.NL-T">
  <system-out><![CDATA[
 (evidence of what was accomplished)
  ]]></system-out>
 </testcase>
-```
+
+- Must end with the closing tag `</testcase>` (well‑formed XML only).

 - If test fails, include: `<failure message="reason"/>`
 - TESTID must be one of: NL-0, NL-1, NL-2, NL-3, NL-4
--- a/.github/workflows/claude-nl-suite.yml
+++ b/.github/workflows/claude-nl-suite.yml
@@ -858,9 +858,11 @@ jobs:
          from pathlib import Path
          import xml.etree.ElementTree as ET
          import re
+          import shutil

          DESIRED = ["NL-0","NL-1","NL-2","NL-3","NL-4","T-A","T-B","T-C","T-D","T-E","T-F","T-G","T-H","T-I","T-J"]
          seen = set()
+          bad = set()
          def id_from_filename(p: Path):
            n = p.name
            m = re.match(r'NL-?(\d+)_results\.xml$', n, re.I)
@@ -872,12 +874,23 @@ jobs:
            return None

          for p in Path("reports").glob("*_results.xml"):
+            fid = id_from_filename(p)
            try:
              r = ET.parse(p).getroot()
            except Exception:
+              # If the file exists but isn't parseable, preserve it for debugging and
+              # treat it as a failing (malformed) fragment rather than "not produced".
+              if fid in DESIRED and p.exists() and p.stat().st_size > 0:
+                staging = Path("reports/_staging")
+                staging.mkdir(parents=True, exist_ok=True)
+                preserved = staging / f"{fid}_malformed.xml"
+                try:
+                  shutil.copyfile(p, preserved)
+                except Exception:
+                  pass
+                bad.add(fid)
              continue
            # Count by filename id primarily; fall back to testcase name if needed
-            fid = id_from_filename(p)
            if fid in DESIRED:
              seen.add(fid)
              continue
@@ -894,8 +907,12 @@ jobs:
              continue
            frag = Path(f"reports/{d}_results.xml")
            tc = ET.Element("testcase", {"classname":"UnityMCP.NL-T", "name": d})
-            fail = ET.SubElement(tc, "failure", {"message":"not produced"})
-            fail.text = "The agent did not emit a fragment for this test."
+            if d in bad:
+              fail = ET.SubElement(tc, "failure", {"message":"malformed xml"})
+              fail.text = "The agent wrote a fragment file, but it was not valid XML (parse failed). See reports/_staging/*_malformed.xml for the preserved original."
+            else:
+              fail = ET.SubElement(tc, "failure", {"message":"not produced"})
+              fail.text = "The agent did not emit a fragment for this test."
            ET.ElementTree(tc).write(frag, encoding="utf-8", xml_declaration=False)
            print(f"backfill: {d}")
          PY
@@ -1064,6 +1081,31 @@ jobs:
                  name_map[tid] = (tc.get('name') or tid)

          desired = ['NL-0','NL-1','NL-2','NL-3','NL-4','T-A','T-B','T-C','T-D','T-E','T-F','T-G','T-H','T-I','T-J']
+          default_titles = {
+              'NL-0': 'Baseline State Capture',
+              'NL-1': 'Core Method Operations',
+              'NL-2': 'Anchor Comment Insertion',
+              'NL-3': 'End-of-Class Content',
+              'NL-4': 'Console State Verification',
+              'T-A': 'Temporary Helper',
+              'T-B': 'Method Body Interior',
+              'T-C': 'Different Method Interior',
+              'T-D': 'End-of-Class Helper',
+              'T-E': 'Method Evolution',
+              'T-F': 'Atomic Multi-Edit',
+              'T-G': 'Path Normalization',
+              'T-H': 'Validation on Modified',
+              'T-I': 'Failure Surface',
+              'T-J': 'Idempotency',
+          }
+
+          def display_name(test_id: str) -> str:
+              # Prefer the emitted testcase "name" attribute (it may already include ID + title).
+              n = (name_map.get(test_id) or '').strip()
+              if n:
+                  return n
+              t = (default_titles.get(test_id) or '').strip()
+              return f"{test_id} — {t}" if t else test_id

          total = len(cases)
          failures = sum(1 for tc in cases if (tc.find('failure') is not None or tc.find('error') is not None))
@@ -1079,7 +1121,8 @@ jobs:
          ]
          for p in desired:
              st = id_status.get(p, None)
-              lines.append(f"- [x] {p}" if st is True else (f"- [ ] {p} (fail)" if st is False else f"- [ ] {p} (not run)"))
+              label = display_name(p)
+              lines.append(f"- [x] {label}" if st is True else (f"- [ ] {label} (fail)" if st is False else f"- [ ] {label} (not run)"))
          lines.append('')

          lines.append('## Test Details')
@@ -1136,6 +1179,67 @@ jobs:
          md_out.write_text('\n'.join(lines), encoding='utf-8')
          PY

+      # ---------- CI gate: fail job if any NL/T test missing or failed ----------
+      - name: Fail CI if NL/T incomplete or failed
+        if: always()
+        shell: bash
+        run: |
+          python3 - <<'PY'
+          import os, re, sys
+          from pathlib import Path
+          import xml.etree.ElementTree as ET
+
+          desired = ['NL-0','NL-1','NL-2','NL-3','NL-4','T-A','T-B','T-C','T-D','T-E','T-F','T-G','T-H','T-I','T-J']
+
+          junit_path = Path(os.environ.get('JUNIT_OUT', 'reports/junit-nl-suite.xml'))
+          if not junit_path.exists():
+              print("::error::No JUnit output found; failing CI gate.")
+              sys.exit(1)
+
+          def localname(tag: str) -> str:
+              return tag.rsplit('}', 1)[-1] if '}' in tag else tag
+
+          tree = ET.parse(junit_path)
+          root = tree.getroot()
+          suite = root.find('./*') if localname(root.tag) == 'testsuites' else root
+          cases = [] if suite is None else list(suite.findall('.//testcase'))
+
+          def id_from_case(tc):
+              name = (tc.get('name') or '').strip()
+              m = re.match(r'(NL-\d+|T-[A-Z])\b', name)
+              if m:
+                  return m.group(1)
+              so = tc.find('system-out')
+              if so is not None and so.text:
+                  m = re.search(r'\b(NL-\d+|T-[A-Z])\b', so.text)
+                  if m:
+                      return m.group(1)
+              return None
+
+          # Determine status per desired ID (first occurrence wins, matching the summary builder)
+          id_status = {}
+          for tc in cases:
+              tid = id_from_case(tc)
+              if not tid or tid not in desired or tid in id_status:
+                  continue
+              ok = (tc.find('failure') is None and tc.find('error') is None)
+              id_status[tid] = ok
+
+          missing = [d for d in desired if d not in id_status]
+          failed = [d for d, ok in id_status.items() if ok is False]
+
+          if missing:
+              print(f"::error::Missing NL/T tests in JUnit: {' '.join(missing)}")
+          if failed:
+              print(f"::error::Failing NL/T tests in JUnit: {' '.join(sorted(failed))}")
+
+          # Gate: all desired must be present and passing
+          if missing or failed:
+              sys.exit(1)
+
+          print("NL/T CI gate passed: all required tests present and passing.")
+          PY
+
      # ---------- Collect execution transcript (if present) ----------
      - name: Collect action execution transcript
        if: always()