Spaces:

roshcheeku
/

mcq2vid

Sleeping

App Files Files Community

roshcheeku committed on Jun 14, 2025

Commit

a676c64

verified ·

1 Parent(s): 5e01a8b

Update model_utils.py

Browse files

Files changed (1) hide show

model_utils.py +71 -190

model_utils.py CHANGED Viewed

@@ -1,208 +1,89 @@
-# model_utils.py (enhanced with OCR, robust regex)
 import os
 import re
 import pandas as pd
-from transformers import pipeline
-import pytesseract
-from PIL import Image
-import pdf2image
-# Set Hugging Face cache directory
-os.environ["HF_HOME"] = "/tmp/hf_cache"
-os.makedirs("/tmp/hf_cache", exist_ok=True)
-# Zero-shot classification pipeline (fallback model)
-classifier = pipeline(
-    "zero-shot-classification",
-    model="typeform/distilbert-base-uncased-mnli",
-    device=-1  # CPU
-)
-labels = ["question", "option", "answer", "other"]
-CONFIDENCE_THRESHOLD = 0.6
-def clean_option(option: str) -> str:
-    return re.sub(r"^[A-Z0-9][\.\)\]]?\s*", "", str(option).strip())
-# ========================
-# OCR Fallback for PDF
-# ========================
-def extract_text_from_pdf(filepath):
-    try:
-        import pdfplumber
-        with pdfplumber.open(filepath) as pdf:
-            text = "\n".join([p.extract_text() or "" for p in pdf.pages])
-            if text.strip():
-                return text
-    except Exception as e:
-        print(f"pdfplumber error: {e}")
-    print("🔍 Falling back to OCR...")
-    images = pdf2image.convert_from_path(filepath)
-    ocr_text = "\n".join([pytesseract.image_to_string(img) for img in images])
-    return ocr_text
-# ========================
-# MCQ Extraction from Structured Files
-# ========================
-def extract_mcqs_from_structured_file(filepath: str):
-    if filepath.endswith(".csv"):
-        df = pd.read_csv(filepath)
-    else:
-        df = pd.read_excel(filepath)
-    mcqs = []
-    for _, row in df.iterrows():
-        if pd.isna(row.get("Question")):
-            continue
-        options = []
-        for col in ["Option A", "Option B", "Option C", "Option D"]:
-            if col in row:
-                opt = clean_option(row.get(col, ""))
-                if opt:
-                    options.append(opt)
-        correct = str(row.get("Correct Answer", "")).strip()
-        if not correct and pd.notna(row.get("Correct Option", "")):
-            opt_map = {"A": 0, "B": 1, "C": 2, "D": 3}
-            idx = opt_map.get(str(row["Correct Option"]).strip().upper(), 0)
-            correct = options[idx] if idx < len(options) else ""
-        correct = clean_option(correct)
-        mcqs.append({
-            "question": str(row["Question"]).strip(),
-            "options": options,
-            "answer": correct
-        })
-    return mcqs
-# ========================
-# Regex-Based MCQ Extraction
-# ========================
-def normalize_text(text: str) -> str:
-    text = re.sub(r"(?m)^\s*([IVXLC\d]{1,3})[\.\-]\s*", r"\1) ", text)
-    text = re.sub(r"(?m)^[ \t]*[\(\[]?([A-Za-z0-9])[\)\]\.\-:]?\s*", r"\1. ", text)
-    text = re.sub(r"(?i)(Answer|Correct Answer|ANS)[\s:\-→]*\(?([A-Z0-9])\)?[^\S\r\n]*is[^\S\r\n]*correct\.?", r"Answer (\2) is correct.", text)
-    lines = text.splitlines()
-    clean_lines = []
-    seen = {}
-    for ln in lines:
-        key = ln.strip()
-        if len(key.split()) < 3:
-            seen[key] = seen.get(key, 0) + 1
-            if seen[key] > 2:
-                continue
-        clean_lines.append(ln)
-    merged = []
-    for ln in clean_lines:
-        if re.match(r"^\s*\d{1,3}\)\s+|^[A-Z0-9][\.\)]\s+", ln):
-            merged.append(ln)
         else:
-            if merged:
-                merged[-1] += " " + ln.strip()
-            else:
-                merged.append(ln)
-    return "\n".join(merged)
-def extract_mcqs_regex(text: str):
-    text = normalize_text(text)
     mcqs = []
-    segments = re.split(r"(?=\n?\d{1,3}\)\s+)", text)
-    for seg in segments:
-        qm = re.match(r"\s*(?:Q[:\.\)]?\s*)?(\d{1,3}\))?\s*([^\n]+)", seg)
-        if not qm:
-            continue
-        question = (qm.group(1) or "") + " " + qm.group(2).strip()
-        opts = []
-        for ln in seg.splitlines():
-            om = re.match(r"^\s*[\(\[]?([A-Z0-9])[\)\.\]]?\s*[-:]?\s*(.+)", ln)
-            if om:
-                opts.append((om.group(1).upper(), clean_option(om.group(2))))
-        if len(opts) < 2:
             continue
-        am = re.search(r"(?i)(Answer|Correct Option|Correct Answer|Ans)\s*[:\-]?\s*\(?([A-Z0-9])\)?", seg)
-        if not am:
             continue
-        ans_letter = am.group(2).upper()
-        letter_map = {L: T for L, T in opts}
-        if ans_letter not in letter_map:
             continue
-        sorted_opts = [letter_map[L] for L in sorted(letter_map.keys())]
         mcqs.append({
-            "question": question,
-            "options": sorted_opts,
-            "answer": letter_map[ans_letter]
         })
     return mcqs
-# ========================
-# Zero-Shot MCQ Classifier Fallback
-# ========================
-def classify_chunks(chunks):
-    results = classifier(chunks, labels)
-    top_labels = []
-    for res in results:
-        label = res["labels"][0]
-        score = res["scores"][0]
-        top_labels.append(label if score >= CONFIDENCE_THRESHOLD else "other")
-    return top_labels
-def extract_mcqs_with_zero_shot(text: str):
-    chunks = [c.strip() for c in text.split("\n\n") if c.strip()]
-    predicted = classify_chunks(chunks)
-    mcqs, current = [], {"question": "", "options": [], "answer": ""}
-    for chunk, lab in zip(chunks, predicted):
-        if lab == "question":
-            if current["question"]:
-                current["options"] = [clean_option(o) for o in current["options"]]
-                current["answer"] = clean_option(current["answer"] or current["options"][0])
-                mcqs.append(current)
-                current = {"question": "", "options": [], "answer": ""}
-            current["question"] = chunk
-        elif lab == "option":
-            current["options"].append(chunk)
-        elif lab == "answer":
-            current["answer"] = chunk
-    if current["question"]:
-        current["options"] = [clean_option(o) for o in current["options"]]
-        current["answer"] = clean_option(current["answer"] or current["options"][0])
-        mcqs.append(current)
-    return mcqs
-# ========================
-# Master Wrapper
-# ========================
-def extract_mcqs_from_file(filepath: str, raw_text: str = None):
-    ext = os.path.splitext(filepath)[-1].lower()
-    if ext in ['.xls', '.xlsx', '.csv']:
-        return extract_mcqs_from_structured_file(filepath)
-    elif raw_text:
-        mcqs = extract_mcqs_regex(raw_text)
-        if len(mcqs) < 5:
-            print("🔁 Regex fallback insufficient. Using zero-shot.")
-            mcqs.extend(extract_mcqs_with_zero_shot(raw_text))
-        return mcqs
-    else:
-        return []

+# model_utils.py
 import os
 import re
 import pandas as pd
+def extract_mcqs_from_file(filepath, raw_text=None):
+    if not raw_text:
+        ext = filepath.rsplit(".", 1)[-1].lower()
+        if ext == 'pdf':
+            import pdfplumber
+            text = []
+            with pdfplumber.open(filepath) as pdf:
+                for page in pdf.pages:
+                    page_text = page.extract_text()
+                    if page_text:
+                        text.append(page_text)
+            raw_text = "\n".join(text)
+        elif ext == 'docx':
+            from docx import Document
+            doc = Document(filepath)
+            raw_text = "\n".join([p.text for p in doc.paragraphs])
+        elif ext in ['xls', 'xlsx']:
+            df = pd.read_excel(filepath)
+            return df.to_dict(orient='records')
+        elif ext == 'csv':
+            df = pd.read_csv(filepath)
+            return df.to_dict(orient='records')
         else:
+            raise ValueError("Unsupported file format")
     mcqs = []
+    question = ""
+    options = []
+    answer = ""
+    explanation = ""
+    lines = raw_text.splitlines()
+    for i, line in enumerate(lines):
+        line = line.strip()
+        # Identify questions
+        qm = re.match(r"\s*(?:Q[:\.\)]?\s*)?(\d{1,3}\))?\s*(.*?)(?:\?|\n|$)", line)
+        if qm and len(line.split()) > 3:
+            if question:
+                mcqs.append({
+                    'question': question.strip(),
+                    'options': options,
+                    'answer': answer,
+                    'explanation': explanation
+                })
+                options = []
+                answer = ""
+                explanation = ""
+            question = qm.group(2).strip()
             continue
+        # Identify options (A, B, C, D etc.)
+        opt = re.match(r"^(?:[a-dA-D][\)\.]|[\(]?[a-dA-D][\)])\s+(.*)", line)
+        if opt:
+            options.append(opt.group(1).strip())
             continue
+        # Identify answer
+        ans = re.match(r"^(Answer|Ans|Correct answer)[:\-\s]*([a-dA-D])", line, re.IGNORECASE)
+        if ans:
+            answer = ans.group(2).upper()
             continue
+        # Identify explanation
+        exp = re.match(r"^(Explanation|Why|Because)[:\-\s]*(.*)", line, re.IGNORECASE)
+        if exp:
+            explanation = exp.group(2).strip()
+            # Accumulate further explanation lines
+            j = i + 1
+            while j < len(lines) and lines[j].strip() and not re.match(r"^Q|\d+[\)\.]", lines[j]):
+                explanation += " " + lines[j].strip()
+                j += 1
+    # Append last MCQ if exists
+    if question:
         mcqs.append({
+            'question': question.strip(),
+            'options': options,
+            'answer': answer,
+            'explanation': explanation
         })
     return mcqs