TRIAL

Sleeping

atz21 committed on Sep 29, 2025

Commit

27ea33f

verified ·

1 Parent(s): 2dd3b2b

Update app.py

fix qp_extraction

Files changed (1) hide show

app.py CHANGED Viewed

@@ -213,32 +213,30 @@ def gemini_generate_content(model, prompt_text, file_upload_obj=None, image_obj=
     return raw_text
 # ---------------- PARSERS ----------------
-def extract_question_ids_from_qpms(text):
-    """
-    Extract question IDs from QP+MS transcript output.
-    We expect the QP+MS prompt to produce lines like 'Question: <id>'
-    Return a list of IDs in order of appearance, including duplicates.
-    """
-    print("🔎 Extracting question IDs from QP+MS transcript using regex...")
-    ids = []
-    for m in re.finditer(r"(?im)^\s*Question\s*:\s*([0-9]+(?:(?:\.[a-zA-Z0-9]+)+|(?:\([a-zA-Z0-9]+\))+|[a-zA-Z])*)\b", text):
-        qid = m.group(1).strip()
-        ids.append(qid)
-    if ids:
-        print(f"✅ Extracted {len(ids)} question IDs.")
-        print("IDs:", ids)
-        return ids
-    # fallback scans
-    for m in re.finditer(r"(?m)^\s*([0-9]+(?:(?:\.[a-zA-Z0-9]+)+|(?:\([a-zA-Z0-9]+\))+|[a-zA-Z])*)\s*[\.\):\-]\s", text):
-        qid = m.group(1).strip()
-        ids.append(qid)
-    if ids:
-        print(f"✅ Extracted {len(ids)} question IDs (fallback heuristic).")
-        print("IDs:", ids)
-    else:
-        print("⚠️ No question IDs extracted; will send NA placeholder.")
-    return ids
 # Update AS prompt builder to include graph detection

     return raw_text
 # ---------------- PARSERS ----------------
+def extract_question_ids_from_qpms(text: str):
+    """Extract question IDs from QP+MS transcript.
+    Two-step approach: explicit 'Question X' lines, then fallback numbered lists.
+    Robust to hidden whitespace and simple unicode spaces."""
+    print("🔎 Extracting question IDs from QP+MS transcript using regex...")
+    # Normalize spaces/tabs/non-breaking spaces
+    clean_text = text.replace("\u00A0", " ").replace("\t", " ")
+    # Step 1: Look for explicit "Question X" lines
+    primary_matches = re.findall(r"^\s*Question\s*[:\s]\s*([\dA-Za-z.()]+)", clean_text, re.MULTILINE)
+    if primary_matches:
+        print(f"✅ Extracted {len(primary_matches)} question IDs from explicit 'Question X' lines.")
+        print("IDs:", primary_matches)
+        return primary_matches
+    # Step 2: Fallback — numbered/sub-question lists
+    fallback_matches = re.findall(r"^\s*(\d+(?:[.)]|\([a-zA-Z0-9]+\))?[a-zA-Z0-9]*)", clean_text, re.MULTILINE)
+    if fallback_matches:
+        print(f"✅ Extracted {len(fallback_matches)} question IDs (fallback numbered lists).")
+        print("IDs:", fallback_matches)
+    else:
+        print("⚠️ No question IDs extracted; will send NA placeholder.")
+    return fallback_matches
 # Update AS prompt builder to include graph detection