Fix/ci cleanup (#484)

* CI: tighten NL/T reporting and fail job on missing/failed tests
* CI: include test titles in summary checklist
2025-12-24 13:12:26 -08:00 · 2025-12-24 13:12:26 -08:00 · 8e3bd1d70c
parent 91b6f4d8d6
commit 8e3bd1d70c
2 changed files with 111 additions and 7 deletions
--- a/.claude/prompts/nl-unity-suite-nl.md
+++ b/.claude/prompts/nl-unity-suite-nl.md
@ -18,15 +18,15 @@ AllowedTools: Write,mcp__unity__manage_editor,mcp__unity__list_resources,mcp__un
 - Each file must contain EXACTLY one `<testcase>` root element
 - NO prologue, epilogue, code fences, or extra characters
 - NO markdown formatting or explanations outside the XML
- Use this exact format:
+- Use this exact shape (write the XML directly into the file; do not wrap it in ``` fences):
 ```xml
 <testcase name="NL-0 — Baseline State Capture" classname="UnityMCP.NL-T">
  <system-out><![CDATA[
 (evidence of what was accomplished)
  ]]></system-out>
 </testcase>
-```
+
 - Must end with the closing tag `</testcase>` (well‑formed XML only).
 - If test fails, include: `<failure message="reason"/>`
 - TESTID must be one of: NL-0, NL-1, NL-2, NL-3, NL-4
--- a/.github/workflows/claude-nl-suite.yml
+++ b/.github/workflows/claude-nl-suite.yml
@ -858,9 +858,11 @@ jobs:
          from pathlib import Path
          import xml.etree.ElementTree as ET
          import re
          import shutil
          # Test IDs the suite must report on: NL-0 … NL-4 followed by T-A … T-J.
          DESIRED = [f"NL-{i}" for i in range(5)] + [f"T-{c}" for c in "ABCDEFGHIJ"]
          # seen: IDs whose fragment file parsed; bad: IDs whose fragment exists but failed to parse.
          seen, bad = set(), set()
          def id_from_filename(p: Path):
            n = p.name
            m = re.match(r'NL-?(\d+)_results\.xml$', n, re.I)
@ -872,12 +874,23 @@ jobs:
            return None
          for p in Path("reports").glob("*_results.xml"):
            fid = id_from_filename(p)
            try:
              r = ET.parse(p).getroot()
            except Exception:
              # If the file exists but isn't parseable, preserve it for debugging and
              # treat it as a failing (malformed) fragment rather than "not produced".
              if fid in DESIRED and p.exists() and p.stat().st_size > 0:
                staging = Path("reports/_staging")
                staging.mkdir(parents=True, exist_ok=True)
                preserved = staging / f"{fid}_malformed.xml"
                try:
                  shutil.copyfile(p, preserved)
                except Exception:
                  pass
                bad.add(fid)
              continue
            # Count by filename id primarily; fall back to testcase name if needed
            fid = id_from_filename(p)
            if fid in DESIRED:
              seen.add(fid)
              continue
@ -894,8 +907,12 @@ jobs:
              continue
            frag = Path(f"reports/{d}_results.xml")
            tc = ET.Element("testcase", {"classname":"UnityMCP.NL-T", "name": d})
-            fail = ET.SubElement(tc, "failure", {"message":"not produced"})
+            if d in bad:
-            fail.text = "The agent did not emit a fragment for this test."
+              fail = ET.SubElement(tc, "failure", {"message":"malformed xml"})
              fail.text = "The agent wrote a fragment file, but it was not valid XML (parse failed). See reports/_staging/*_malformed.xml for the preserved original."
            else:
              fail = ET.SubElement(tc, "failure", {"message":"not produced"})
              fail.text = "The agent did not emit a fragment for this test."
            ET.ElementTree(tc).write(frag, encoding="utf-8", xml_declaration=False)
            print(f"backfill: {d}")
          PY
@ -1064,6 +1081,31 @@ jobs:
                  name_map[tid] = (tc.get('name') or tid)
          # Order in which test IDs are listed in the summary checklist.
          desired = [f'NL-{i}' for i in range(5)] + [f'T-{c}' for c in 'ABCDEFGHIJ']
          # Fallback human-readable titles, paired positionally with `desired`.
          default_titles = dict(zip(desired, (
              'Baseline State Capture',
              'Core Method Operations',
              'Anchor Comment Insertion',
              'End-of-Class Content',
              'Console State Verification',
              'Temporary Helper',
              'Method Body Interior',
              'Different Method Interior',
              'End-of-Class Helper',
              'Method Evolution',
              'Atomic Multi-Edit',
              'Path Normalization',
              'Validation on Modified',
              'Failure Surface',
              'Idempotency',
          )))
          def display_name(test_id: str) -> str:
              """Return the checklist label for *test_id*.

              A non-blank emitted testcase "name" attribute wins (it may already
              include ID + title). Otherwise the label is "<id> — <default title>",
              or just the bare ID when no default title is available.
              """
              emitted = name_map.get(test_id)
              emitted = emitted.strip() if emitted else ''
              if emitted:
                  return emitted
              title = default_titles.get(test_id)
              title = title.strip() if title else ''
              if not title:
                  return test_id
              return f"{test_id} — {title}"
          total = len(cases)
          failures = sum(1 for tc in cases if (tc.find('failure') is not None or tc.find('error') is not None))
@ -1079,7 +1121,8 @@ jobs:
          ]
          for p in desired:
              st = id_status.get(p, None)
-              lines.append(f"- [x] {p}" if st is True else (f"- [ ] {p} (fail)" if st is False else f"- [ ] {p} (not run)"))
+              label = display_name(p)
              lines.append(f"- [x] {label}" if st is True else (f"- [ ] {label} (fail)" if st is False else f"- [ ] {label} (not run)"))
          lines.append('')
          lines.append('## Test Details')
@ -1136,6 +1179,67 @@ jobs:
          md_out.write_text('\n'.join(lines), encoding='utf-8')
          PY
      # ---------- CI gate: fail job if any NL/T test missing or failed ----------
      - name: Fail CI if NL/T incomplete or failed
        if: always()
        shell: bash
        run: |
          python3 - <<'PY'
          import os, re, sys
          from pathlib import Path
          import xml.etree.ElementTree as ET
          # Every one of these IDs must be present and passing for the gate to succeed.
          desired = [f'NL-{i}' for i in range(5)] + [f'T-{c}' for c in 'ABCDEFGHIJ']
          # Report location comes from $JUNIT_OUT, with a fixed default path.
          junit_path = Path(os.getenv('JUNIT_OUT', 'reports/junit-nl-suite.xml'))
          if not junit_path.exists():
              # No report at all is treated as a gate failure.
              print("::error::No JUnit output found; failing CI gate.")
              raise SystemExit(1)
          def localname(tag: str) -> str:
              """Return the part of *tag* after its last '}' (the whole tag if it has none)."""
              _, _, bare = tag.rpartition('}')
              return bare
          # Load the report; a <testsuites> root is unwrapped to its first child element.
          root = ET.parse(junit_path).getroot()
          if localname(root.tag) == 'testsuites':
              suite = root.find('./*')
          else:
              suite = root
          # An empty <testsuites> wrapper yields no suite and therefore no cases.
          cases = suite.findall('.//testcase') if suite is not None else []
          def id_from_case(tc):
              """Extract the NL-<n> / T-<X> test ID from a <testcase> element.

              The ID is read from the start of the stripped "name" attribute; if that
              does not match, the first ID-shaped token in the <system-out> text is
              used. Returns None when neither source yields an ID.
              """
              title = (tc.get('name') or '').strip()
              hit = re.match(r'(NL-\d+|T-[A-Z])\b', title)
              if hit is None:
                  sysout = tc.find('system-out')
                  text = sysout.text if sysout is not None else None
                  if text:
                      hit = re.search(r'\b(NL-\d+|T-[A-Z])\b', text)
              return hit.group(1) if hit else None
          # Determine status per desired ID (first occurrence wins, matching the summary builder)
          id_status = {}
          for tc in cases:
              tid = id_from_case(tc)
              if tid in desired and tid not in id_status:
                  # A case passes only when it has neither a <failure> nor an <error> child.
                  id_status[tid] = all(tc.find(tag) is None for tag in ('failure', 'error'))

          missing = [tid for tid in desired if tid not in id_status]
          failed = [tid for tid, passed in id_status.items() if not passed]

          if missing:
              print(f"::error::Missing NL/T tests in JUnit: {' '.join(missing)}")
          if failed:
              print(f"::error::Failing NL/T tests in JUnit: {' '.join(sorted(failed))}")

          # Gate: all desired must be present and passing
          if missing or failed:
              raise SystemExit(1)

          print("NL/T CI gate passed: all required tests present and passing.")
          PY
      # ---------- Collect execution transcript (if present) ----------
      - name: Collect action execution transcript
        if: always()