Spaces:
Sleeping
Sleeping
Update app.py
Browse files
fix qp_extraction
app.py
CHANGED
|
@@ -213,32 +213,30 @@ def gemini_generate_content(model, prompt_text, file_upload_obj=None, image_obj=
|
|
| 213 |
return raw_text
|
| 214 |
|
| 215 |
# ---------------- PARSERS ----------------
|
| 216 |
-
def extract_question_ids_from_qpms(text
|
| 217 |
-
"""
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
if
|
| 228 |
-
print(f"✅ Extracted {len(
|
| 229 |
-
print("IDs:",
|
| 230 |
-
return
|
| 231 |
-
|
| 232 |
-
#
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
print("IDs
|
| 239 |
-
|
| 240 |
-
print("⚠️ No question IDs extracted; will send NA placeholder.")
|
| 241 |
-
return ids
|
| 242 |
|
| 243 |
# Update AS prompt builder to include graph detection
|
| 244 |
|
|
|
|
| 213 |
return raw_text
|
| 214 |
|
| 215 |
# ---------------- PARSERS ----------------
|
| 216 |
+
def extract_question_ids_from_qpms(text: str):
    """Extract question IDs from a QP+MS transcript.

    Two-step approach:

    1. Look for explicit ``Question X`` lines (optionally ``Question: X``)
       and return the ``X`` parts if any are found.
    2. Otherwise fall back to lines that start with a number, optionally
       followed by ``.``, ``)`` or a parenthesised sub-part such as ``(a)``.
       The fallback keeps that trailing punctuation in the returned ID
       (e.g. ``"1."``, ``"2)"``).

    Matching is line-local: whitespace in the patterns never spans a line
    break, so a line containing only ``Question`` does not pick up the first
    word of the following line as an ID.

    Args:
        text: Transcript text. ``None`` or an empty string is treated as a
            transcript without any IDs.

    Returns:
        A list of ID strings in order of appearance (duplicates are kept);
        an empty list when nothing matched.
    """
    print("🔎 Extracting question IDs from QP+MS transcript using regex...")

    if not text:
        print("⚠️ No question IDs extracted; will send NA placeholder.")
        return []

    # Normalize tabs/non-breaking spaces to plain spaces and drop zero-width
    # characters (BOM, zero-width space): `\s` does not match the latter, so
    # they would otherwise hide a leading "Question" from the `^` anchor.
    clean_text = text.translate(
        {0x00A0: " ", 0x0009: " ", 0xFEFF: None, 0x200B: None}
    )

    # Horizontal whitespace only: any whitespace except line terminators.
    hspace = r"[^\S\r\n]"

    # Step 1: Look for explicit "Question X" lines
    primary_pattern = (
        r"^" + hspace + r"*Question" + hspace + r"*(?::|" + hspace + r")"
        + hspace + r"*([\dA-Za-z.()]+)"
    )
    primary_matches = re.findall(primary_pattern, clean_text, re.MULTILINE)
    if primary_matches:
        print(f"✅ Extracted {len(primary_matches)} question IDs from explicit 'Question X' lines.")
        print("IDs:", primary_matches)
        return primary_matches

    # Step 2: Fallback — numbered/sub-question lists
    fallback_pattern = (
        r"^" + hspace + r"*(\d+(?:[.)]|\([a-zA-Z0-9]+\))?[a-zA-Z0-9]*)"
    )
    fallback_matches = re.findall(fallback_pattern, clean_text, re.MULTILINE)
    if fallback_matches:
        print(f"✅ Extracted {len(fallback_matches)} question IDs (fallback numbered lists).")
        print("IDs:", fallback_matches)
    else:
        print("⚠️ No question IDs extracted; will send NA placeholder.")
    return fallback_matches
|
|
|
|
|
|
|
| 240 |
|
| 241 |
# Update AS prompt builder to include graph detection
|
| 242 |
|