atz21 committed on
Commit
27ea33f
·
verified ·
1 Parent(s): 2dd3b2b

Update app.py

Browse files

fix qp_extraction

Files changed (1) hide show
  1. app.py +24 -26
app.py CHANGED
@@ -213,32 +213,30 @@ def gemini_generate_content(model, prompt_text, file_upload_obj=None, image_obj=
213
  return raw_text
214
 
215
  # ---------------- PARSERS ----------------
216
def extract_question_ids_from_qpms(text):
    """
    Extract question IDs from QP+MS transcript output.

    The primary scan looks for lines of the form 'Question: <id>'
    (case-insensitive). If none are found, a fallback scan looks for lines
    that start with an ID followed by a list delimiter ('.', ')', ':' or '-')
    and whitespace.

    An ID is a run of digits optionally followed by any mix of dotted parts
    ('.a', '.2'), parenthesised parts ('(a)', '(ii)') or single letters,
    e.g. '3', '2.a', '1(b)(ii)', '4a'.

    Args:
        text: Transcript text to scan.

    Returns:
        List of ID strings in order of appearance, including duplicates.
        Empty list when nothing matches.
    """
    print("🔎 Extracting question IDs from QP+MS transcript using regex...")

    # Flattened form of the ID grammar: (A|B|C)* accepts the same strings as
    # (A+|B+|C)* but avoids nested quantifiers and their backtracking cost.
    id_pattern = r"[0-9]+(?:\.[a-zA-Z0-9]+|\([a-zA-Z0-9]+\)|[a-zA-Z])*"
    # A bare \b never matches between ')' and a following space/newline, which
    # used to truncate '1(a)' to '1'. Also accept the position right after ')'.
    id_end = r"(?:\b|(?<=\)))"

    primary = r"(?im)^\s*Question\s*:\s*(" + id_pattern + r")" + id_end
    ids = [m.group(1).strip() for m in re.finditer(primary, text)]
    if ids:
        print(f"✅ Extracted {len(ids)} question IDs.")
        print("IDs:", ids)
        return ids

    # fallback scans
    fallback = r"(?m)^\s*(" + id_pattern + r")\s*[\.\):\-]\s"
    ids = [m.group(1).strip() for m in re.finditer(fallback, text)]
    if ids:
        print(f"✅ Extracted {len(ids)} question IDs (fallback heuristic).")
        print("IDs:", ids)
    else:
        print("⚠️ No question IDs extracted; will send NA placeholder.")
    return ids
242
 
243
  # Update AS prompt builder to include graph detection
244
 
 
213
  return raw_text
214
 
215
  # ---------------- PARSERS ----------------
216
def extract_question_ids_from_qpms(text: str):
    """Extract question IDs from a QP+MS transcript.

    Two-step approach:

    1. Explicit lines of the form 'Question <id>' or 'Question: <id>'
       (case-insensitive). If any are found, only these are returned.
    2. Fallback: numbered-list lines such as '1. ...', '2) ...', '3(a) ...'
       or '4 - ...'. The list delimiter is not part of the returned ID, and a
       line that merely starts with a number (e.g. '12 cm') is not treated as
       a question.

    An ID is a run of digits optionally followed by any mix of dotted parts
    ('.a'), parenthesised parts ('(ii)') or single letters, e.g. '3', '2.a',
    '1(b)(ii)', '4a'.

    Non-breaking spaces and tabs are normalised to plain spaces, and
    zero-width characters / BOM are removed before matching.

    Args:
        text: Transcript text to scan. Empty or None yields an empty list.

    Returns:
        List of ID strings in order of appearance, including duplicates.
        Empty list when nothing matches.
    """
    print("🔎 Extracting question IDs from QP+MS transcript using regex...")

    if not text:
        print("⚠️ No question IDs extracted; will send NA placeholder.")
        return []

    # Single pass: map NBSP/tab to a space and delete invisible characters
    # that \s does not match and that would otherwise split a token.
    hidden_chars = {
        0x00A0: " ",   # non-breaking space
        0x0009: " ",   # tab
        0x200B: None,  # zero-width space
        0x200C: None,  # zero-width non-joiner
        0x200D: None,  # zero-width joiner
        0x2060: None,  # word joiner
        0xFEFF: None,  # byte-order mark / zero-width no-break space
    }
    clean_text = text.translate(hidden_chars)

    # Horizontal whitespace only, so a match can never run across a line break.
    hs = r"[^\S\r\n]"
    # Digit-led ID grammar; keeps plain words ('Question paper') and trailing
    # punctuation ('1.') out of the captured ID.
    qid = r"\d+(?:\.[A-Za-z0-9]+|\([A-Za-z0-9]+\)|[A-Za-z])*"

    # Step 1: Look for explicit "Question X" lines
    primary_pattern = rf"^{hs}*Question(?:{hs}*:{hs}*|{hs}+)({qid})"
    primary_matches = re.findall(
        primary_pattern, clean_text, re.MULTILINE | re.IGNORECASE
    )
    if primary_matches:
        print(f"✅ Extracted {len(primary_matches)} question IDs from explicit 'Question X' lines.")
        print("IDs:", primary_matches)
        return primary_matches

    # Step 2: Fallback — numbered/sub-question lists.
    # The ID must either end in ')' or be followed by a list delimiter, and
    # then by whitespace or end of line; this rejects '12 cm' and '3.5 cm'.
    fallback_pattern = rf"^{hs}*({qid})(?:(?<=\))|{hs}*[.):\-])(?=\s|$)"
    fallback_matches = re.findall(fallback_pattern, clean_text, re.MULTILINE)
    if fallback_matches:
        print(f"✅ Extracted {len(fallback_matches)} question IDs (fallback numbered lists).")
        print("IDs:", fallback_matches)
    else:
        print("⚠️ No question IDs extracted; will send NA placeholder.")
    return fallback_matches
 
 
240
 
241
  # Update AS prompt builder to include graph detection
242