roshcheeku committed on
Commit
a676c64
·
verified ·
1 Parent(s): 5e01a8b

Update model_utils.py

Browse files
Files changed (1) hide show
  1. model_utils.py +71 -190
model_utils.py CHANGED
@@ -1,208 +1,89 @@
1
- # model_utils.py (enhanced with OCR, robust regex)
2
 
3
  import os
4
  import re
5
  import pandas as pd
6
- from transformers import pipeline
7
- import pytesseract
8
- from PIL import Image
9
- import pdf2image
10
 
11
- # Set Hugging Face cache directory
12
- os.environ["HF_HOME"] = "/tmp/hf_cache"
13
- os.makedirs("/tmp/hf_cache", exist_ok=True)
14
-
15
- # Zero-shot classification pipeline (fallback model)
16
- classifier = pipeline(
17
- "zero-shot-classification",
18
- model="typeform/distilbert-base-uncased-mnli",
19
- device=-1 # CPU
20
- )
21
-
22
- labels = ["question", "option", "answer", "other"]
23
- CONFIDENCE_THRESHOLD = 0.6
24
-
25
-
26
- def clean_option(option: str) -> str:
27
- return re.sub(r"^[A-Z0-9][\.\)\]]?\s*", "", str(option).strip())
28
-
29
-
30
- # ========================
31
- # OCR Fallback for PDF
32
- # ========================
33
- def extract_text_from_pdf(filepath):
34
- try:
35
- import pdfplumber
36
- with pdfplumber.open(filepath) as pdf:
37
- text = "\n".join([p.extract_text() or "" for p in pdf.pages])
38
- if text.strip():
39
- return text
40
- except Exception as e:
41
- print(f"pdfplumber error: {e}")
42
-
43
- print("🔍 Falling back to OCR...")
44
- images = pdf2image.convert_from_path(filepath)
45
- ocr_text = "\n".join([pytesseract.image_to_string(img) for img in images])
46
- return ocr_text
47
-
48
-
49
- # ========================
50
- # MCQ Extraction from Structured Files
51
- # ========================
52
- def extract_mcqs_from_structured_file(filepath: str):
53
- if filepath.endswith(".csv"):
54
- df = pd.read_csv(filepath)
55
- else:
56
- df = pd.read_excel(filepath)
57
-
58
- mcqs = []
59
- for _, row in df.iterrows():
60
- if pd.isna(row.get("Question")):
61
- continue
62
-
63
- options = []
64
- for col in ["Option A", "Option B", "Option C", "Option D"]:
65
- if col in row:
66
- opt = clean_option(row.get(col, ""))
67
- if opt:
68
- options.append(opt)
69
-
70
- correct = str(row.get("Correct Answer", "")).strip()
71
- if not correct and pd.notna(row.get("Correct Option", "")):
72
- opt_map = {"A": 0, "B": 1, "C": 2, "D": 3}
73
- idx = opt_map.get(str(row["Correct Option"]).strip().upper(), 0)
74
- correct = options[idx] if idx < len(options) else ""
75
- correct = clean_option(correct)
76
-
77
- mcqs.append({
78
- "question": str(row["Question"]).strip(),
79
- "options": options,
80
- "answer": correct
81
- })
82
-
83
- return mcqs
84
-
85
-
86
- # ========================
87
- # Regex-Based MCQ Extraction
88
- # ========================
89
- def normalize_text(text: str) -> str:
90
- text = re.sub(r"(?m)^\s*([IVXLC\d]{1,3})[\.\-]\s*", r"\1) ", text)
91
- text = re.sub(r"(?m)^[ \t]*[\(\[]?([A-Za-z0-9])[\)\]\.\-:]?\s*", r"\1. ", text)
92
- text = re.sub(r"(?i)(Answer|Correct Answer|ANS)[\s:\-→]*\(?([A-Z0-9])\)?[^\S\r\n]*is[^\S\r\n]*correct\.?", r"Answer (\2) is correct.", text)
93
-
94
- lines = text.splitlines()
95
- clean_lines = []
96
- seen = {}
97
- for ln in lines:
98
- key = ln.strip()
99
- if len(key.split()) < 3:
100
- seen[key] = seen.get(key, 0) + 1
101
- if seen[key] > 2:
102
- continue
103
- clean_lines.append(ln)
104
-
105
- merged = []
106
- for ln in clean_lines:
107
- if re.match(r"^\s*\d{1,3}\)\s+|^[A-Z0-9][\.\)]\s+", ln):
108
- merged.append(ln)
109
  else:
110
- if merged:
111
- merged[-1] += " " + ln.strip()
112
- else:
113
- merged.append(ln)
114
- return "\n".join(merged)
115
 
116
-
117
- def extract_mcqs_regex(text: str):
118
- text = normalize_text(text)
119
  mcqs = []
120
- segments = re.split(r"(?=\n?\d{1,3}\)\s+)", text)
121
-
122
- for seg in segments:
123
- qm = re.match(r"\s*(?:Q[:\.\)]?\s*)?(\d{1,3}\))?\s*([^
124
- ]+)", seg)
125
- if not qm:
126
- continue
127
- question = (qm.group(1) or "") + " " + qm.group(2).strip()
128
-
129
- opts = []
130
- for ln in seg.splitlines():
131
- om = re.match(r"^\s*[\(\[]?([A-Z0-9])[\)\.\]]?\s*[-:]?\s*(.+)", ln)
132
- if om:
133
- opts.append((om.group(1).upper(), clean_option(om.group(2))))
134
- if len(opts) < 2:
 
 
 
 
 
 
 
 
135
  continue
136
 
137
- am = re.search(r"(?i)(Answer|Correct Option|Correct Answer|Ans)\s*[:\-]?\s*\(?([A-Z0-9])\)?", seg)
138
- if not am:
 
 
139
  continue
140
- ans_letter = am.group(2).upper()
141
 
142
- letter_map = {L: T for L, T in opts}
143
- if ans_letter not in letter_map:
 
 
144
  continue
145
 
146
- sorted_opts = [letter_map[L] for L in sorted(letter_map.keys())]
 
 
 
 
 
 
 
 
 
 
 
147
  mcqs.append({
148
- "question": question,
149
- "options": sorted_opts,
150
- "answer": letter_map[ans_letter]
 
151
  })
152
 
153
  return mcqs
154
-
155
-
156
- # ========================
157
- # Zero-Shot MCQ Classifier Fallback
158
- # ========================
159
- def classify_chunks(chunks):
160
- results = classifier(chunks, labels)
161
- top_labels = []
162
- for res in results:
163
- label = res["labels"][0]
164
- score = res["scores"][0]
165
- top_labels.append(label if score >= CONFIDENCE_THRESHOLD else "other")
166
- return top_labels
167
-
168
-
169
- def extract_mcqs_with_zero_shot(text: str):
170
- chunks = [c.strip() for c in text.split("\n\n") if c.strip()]
171
- predicted = classify_chunks(chunks)
172
-
173
- mcqs, current = [], {"question": "", "options": [], "answer": ""}
174
- for chunk, lab in zip(chunks, predicted):
175
- if lab == "question":
176
- if current["question"]:
177
- current["options"] = [clean_option(o) for o in current["options"]]
178
- current["answer"] = clean_option(current["answer"] or current["options"][0])
179
- mcqs.append(current)
180
- current = {"question": "", "options": [], "answer": ""}
181
- current["question"] = chunk
182
- elif lab == "option":
183
- current["options"].append(chunk)
184
- elif lab == "answer":
185
- current["answer"] = chunk
186
- if current["question"]:
187
- current["options"] = [clean_option(o) for o in current["options"]]
188
- current["answer"] = clean_option(current["answer"] or current["options"][0])
189
- mcqs.append(current)
190
-
191
- return mcqs
192
-
193
-
194
- # ========================
195
- # Master Wrapper
196
- # ========================
197
- def extract_mcqs_from_file(filepath: str, raw_text: str = None):
198
- ext = os.path.splitext(filepath)[-1].lower()
199
- if ext in ['.xls', '.xlsx', '.csv']:
200
- return extract_mcqs_from_structured_file(filepath)
201
- elif raw_text:
202
- mcqs = extract_mcqs_regex(raw_text)
203
- if len(mcqs) < 5:
204
- print("🔁 Regex fallback insufficient. Using zero-shot.")
205
- mcqs.extend(extract_mcqs_with_zero_shot(raw_text))
206
- return mcqs
207
- else:
208
- return []
 
1
+ # model_utils.py
2
 
3
  import os
4
  import re
5
  import pandas as pd
 
 
 
 
6
 
7
# Line classifiers, compiled once so the per-line loop only performs matches.
# Option label such as "A)", "b.", "(c)" followed by the option text.
_OPTION_RE = re.compile(r"^(?:[a-dA-D][\)\.]|[\(]?[a-dA-D][\)])\s+(.*)")
# Answer marker followed by a *single* option letter ("Answer: B", "Ans - (c)").
# The trailing \b stops "Answer: Delhi" from being read as option D.
_ANSWER_RE = re.compile(
    r"^(?:Correct answer|Answer|Ans)\b[\s:.\-]*\(?([a-dA-D])\b", re.IGNORECASE
)
# Unambiguous explanation marker: "Explanation ...", or "Why"/"Because"
# followed by an explicit ":" / "-" separator.
_EXPLANATION_MARKER_RE = re.compile(
    r"^(?:Explanation\b|(?:Why|Because)\s*[:\-])[:\-\s]*(.*)", re.IGNORECASE
)
# Looser explanation pattern, only consulted for lines too short to be a
# question (e.g. "Because gravity").
_EXPLANATION_RE = re.compile(r"^(Explanation|Why|Because)[:\-\s]*(.*)", re.IGNORECASE)
# Strips an optional "Q"/"12)" prefix and captures the text up to the first "?".
# Every part is optional, so this matches ANY line: it extracts text, it does
# not classify.
_QUESTION_RE = re.compile(r"\s*(?:Q[:\.\)]?\s*)?(\d{1,3}\))?\s*(.*?)(?:\?|\n|$)")
# Explicitly numbered line ("Q1.", "Question 2:", "3)"), which ends a running
# explanation.
_QUESTION_START_RE = re.compile(r"^(?:Q(?:uestion)?\s*\d*\s*[:.\)]|\d+[\).])")


def _parse_mcq_text(raw_text):
    """Parse plain text into a list of MCQ dicts.

    Each non-blank line is classified in this order: option, answer,
    explanation marker, explanation continuation, question (any remaining
    line with more than three words), short explanation. Labelled lines are
    checked before the question heuristic because that heuristic accepts any
    sufficiently long line.
    """
    mcqs = []
    question = ""
    options = []
    answer = ""
    explanation = ""
    in_explanation = False  # True while following lines extend the explanation

    for raw_line in raw_text.splitlines():
        line = raw_line.strip()
        if not line:
            # A blank line terminates a multi-line explanation.
            in_explanation = False
            continue

        opt = _OPTION_RE.match(line)
        if opt:
            options.append(opt.group(1).strip())
            in_explanation = False
            continue

        ans = _ANSWER_RE.match(line)
        if ans:
            answer = ans.group(1).upper()
            in_explanation = False
            continue

        exp = _EXPLANATION_MARKER_RE.match(line)
        if exp:
            explanation = exp.group(1).strip()
            in_explanation = True
            continue

        if in_explanation:
            if not _QUESTION_START_RE.match(line):
                # Consume the continuation here so it is not classified again
                # (it would otherwise usually be mistaken for a new question).
                explanation = f"{explanation} {line}".strip()
                continue
            in_explanation = False

        if len(line.split()) > 3:
            if question:
                mcqs.append({
                    'question': question.strip(),
                    'options': options,
                    'answer': answer,
                    'explanation': explanation
                })
                options = []
                answer = ""
                explanation = ""
            question = _QUESTION_RE.match(line).group(2).strip()
            continue

        exp = _EXPLANATION_RE.match(line)
        if exp:
            explanation = exp.group(2).strip()
            in_explanation = True

    # Append the last MCQ if one is still open.
    if question:
        mcqs.append({
            'question': question.strip(),
            'options': options,
            'answer': answer,
            'explanation': explanation
        })

    return mcqs


def extract_mcqs_from_file(filepath, raw_text=None):
    """Extract multiple-choice questions from a file or from supplied text.

    Args:
        filepath: Path of the source file. Its extension selects the reader
            and is only consulted when ``raw_text`` is empty or ``None``.
        raw_text: Optional already-extracted text. When non-empty it is
            parsed directly and ``filepath`` is not opened.

    Returns:
        For text sources (``raw_text``, ``.pdf``, ``.docx``): a list of dicts
        with keys ``question``, ``options`` (list of str), ``answer``
        (upper-case option letter, or ``""``) and ``explanation``.
        For ``.xls``/``.xlsx``/``.csv``: the sheet rows as returned by
        ``DataFrame.to_dict(orient='records')``, without MCQ parsing.

    Raises:
        ValueError: If ``raw_text`` is empty and the extension is unsupported.
    """
    if not raw_text:
        ext = filepath.rsplit(".", 1)[-1].lower()
        if ext == 'pdf':
            # Imported lazily so the dependency is only needed for PDF input.
            import pdfplumber
            pages = []
            with pdfplumber.open(filepath) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        pages.append(page_text)
            raw_text = "\n".join(pages)
        elif ext == 'docx':
            from docx import Document
            doc = Document(filepath)
            raw_text = "\n".join([p.text for p in doc.paragraphs])
        elif ext in ['xls', 'xlsx']:
            return pd.read_excel(filepath).to_dict(orient='records')
        elif ext == 'csv':
            return pd.read_csv(filepath).to_dict(orient='records')
        else:
            raise ValueError("Unsupported file format")

    return _parse_mcq_text(raw_text)