Chaitu2112 committed on
Commit
ba2818e
·
verified ·
1 Parent(s): c40bad1

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +1536 -0
app.py ADDED
@@ -0,0 +1,1536 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # backend.py
3
+ import uvicorn
4
+ from fastapi import FastAPI, UploadFile, File, Form
5
+ from fastapi.middleware.cors import CORSMiddleware
6
+ from fastapi.responses import JSONResponse, StreamingResponse, FileResponse, HTMLResponse
7
+ from fastapi.staticfiles import StaticFiles
8
+ import tempfile, io, os, re, json, base64, hashlib
9
+ from typing import List, Tuple, Dict
10
+ import fitz # PyMuPDF
11
+ import requests
12
+ import pandas as pd
13
+ from docx import Document
14
+ from io import BytesIO
15
+
16
+ from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime, Boolean
17
+ from sqlalchemy.ext.declarative import declarative_base
18
+ from sqlalchemy.orm import sessionmaker
19
+ import datetime
20
+
21
+ from urllib.parse import quote_plus
import os

# Legacy MySQL settings. The engine below is built from DATABASE_URL (SQLite),
# so none of these values are used to open a connection in this section.
# The password is read from the environment instead of being committed to
# source control; it defaults to an empty string when unset.
MYSQL_USER = "root"
MYSQL_PASSWORD = os.getenv("MYSQL_PASSWORD", "")
MYSQL_HOST = "localhost"
MYSQL_PORT = 3306
MYSQL_DB = "mcq_db"

# URL-encode the password so special characters would survive inside a DSN.
encoded_password = quote_plus(MYSQL_PASSWORD)

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, declarative_base

# Use SQLite instead of MySQL
DATABASE_URL = "sqlite:///./app.db"

engine = create_engine(
    DATABASE_URL,
    connect_args={"check_same_thread": False},  # Needed for SQLite
)
# Session factory: explicit commits, no automatic flush before queries.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
from sqlalchemy.orm import declarative_base
Base = declarative_base()
45
+
class Question(Base):
    """ORM row for one generated question (multiple-choice or descriptive)."""

    __tablename__ = "questions"

    id = Column(Integer, primary_key=True, index=True)
    topic = Column(String(255))
    type = Column(String(20))  # MCQ / Descriptive
    question = Column(Text, nullable=False)

    # The four option columns and `answer` are filled for MCQ rows only.
    option_a = Column(Text)
    option_b = Column(Text)
    option_c = Column(Text)
    option_d = Column(Text)
    answer = Column(Text)

    # Filled for Descriptive rows only.
    descriptive_answer = Column(Text)

    difficulty = Column(String(10))
    created_at = Column(DateTime, default=datetime.datetime.utcnow)

    # Review state. New rows are stored with None ("pending"); the question
    # paper generator only selects rows where this is True.
    # NOTE(review): False presumably means "rejected" - confirm with the UI.
    flagged = Column(Boolean, default=None)


# Create table if not exists
Base.metadata.create_all(bind=engine)
65
+
66
+
67
+
68
+
69
+ import json
70
+
def _first_truthy(item: dict, *keys):
    """Return the first truthy value found under `keys` in `item`, else None."""
    for key in keys:
        value = item.get(key)
        if value:
            return value
    return None


def _option_or_fallback(options, index: int, item: dict, key: str):
    """Return options[index] when present, otherwise item[key] (or None)."""
    if len(options) > index:
        return options[index]
    return item.get(key) or None


def _build_question_row(topic, kind: str, item: dict, question_text: str):
    """Build an unsaved Question row of the given kind ("MCQ"/"Descriptive")."""
    difficulty = item.get("difficulty")
    common = dict(
        topic=topic,
        type=kind,
        question=question_text,
        difficulty=str(difficulty) if difficulty is not None else None,
        created_at=datetime.datetime.utcnow(),
        flagged=None,  # pending by default
    )
    if kind == "MCQ":
        opts = item.get("options", []) or []
        return Question(
            option_a=_option_or_fallback(opts, 0, item, "option_a"),
            option_b=_option_or_fallback(opts, 1, item, "option_b"),
            option_c=_option_or_fallback(opts, 2, item, "option_c"),
            option_d=_option_or_fallback(opts, 3, item, "option_d"),
            answer=_first_truthy(item, "answer", "ans"),
            descriptive_answer=None,
            **common,
        )
    return Question(
        option_a=None,
        option_b=None,
        option_c=None,
        option_d=None,
        answer=None,
        descriptive_answer=_first_truthy(item, "answer", "descriptive_answer"),
        **common,
    )


def save_questions_to_db(results: dict):
    """
    Save parsed results into the questions table.

    Expected `results` structure:
    {
        "Topic Name": {
            "mcqs": [ { "question": "...", "options": [...], "answer": "A", "difficulty": 2 }, ... ],
            "descriptive": [ { "question": "...", "answer": "...", "difficulty": 3 }, ... ]
        },
        ...
    }

    The function is defensive: entries that are not dicts, or that lack a
    non-blank question text (key "question" or "q"), are skipped and logged
    instead of aborting the whole batch. A topic whose value is not a dict is
    ignored.

    Returns:
        {"status": "success", "saved": int, "skipped": int} after a commit, or
        {"status": "error", "error": str} after a rollback. Exceptions are not
        propagated to the caller.
    """
    db = SessionLocal()
    saved = 0
    skipped = 0

    try:
        for topic, data in (results or {}).items():
            if not isinstance(data, dict):
                continue

            # Both kinds share the same validation and insert path.
            for kind, list_key in (("MCQ", "mcqs"), ("Descriptive", "descriptive")):
                for item in data.get(list_key) or []:
                    question_text = (
                        _first_truthy(item, "question", "q")
                        if isinstance(item, dict)
                        else None
                    )
                    if not question_text or not str(question_text).strip():
                        print(f"⚠️ Skipping {kind} with no question text:", item)
                        skipped += 1
                        continue

                    db.add(
                        _build_question_row(
                            topic, kind, item, str(question_text).strip()
                        )
                    )
                    saved += 1

        db.commit()
        return {"status": "success", "saved": saved, "skipped": skipped}

    except Exception as e:
        db.rollback()
        print("❌ DB error in save_questions_to_db:", e)
        return {"status": "error", "error": str(e)}
    finally:
        db.close()
173
+
174
+
175
+
176
+ # ---------- CONFIG ----------
177
+
from dotenv import load_dotenv

# Load variables from a local .env file (if any) before reading them below.
load_dotenv()

# OpenRouter Configuration
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "")  # Set your API key in environment variable
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"
# Free model by default; set OPENROUTER_MODEL in the environment to change it
# without editing the source.
OPENROUTER_MODEL = os.getenv(
    "OPENROUTER_MODEL", "meta-llama/llama-3.3-70b-instruct:free"
)

# Headers for OpenRouter API
OPENROUTER_HEADERS = {
    "Authorization": f"Bearer {OPENROUTER_API_KEY}",
    "Content-Type": "application/json",
    "HTTP-Referer": "http://localhost:8000",  # Optional: your site URL
    "X-Title": "MCQ Generator",  # Optional: your app name
}

MODEL = OPENROUTER_MODEL


HOST = "127.0.0.1"
PORT = 8000
198
+ # ---------- FASTAPI ----------
app = FastAPI()

# CORS. A wildcard origin must not be combined with allow_credentials=True:
# FastAPI's CORS documentation requires explicit origins in that case.
# Trusted origins can be listed, comma-separated, in CORS_ALLOW_ORIGINS;
# credentials are only enabled when such a list is given. Without it every
# origin is allowed, but without credentials.
_cors_origins = [
    origin.strip()
    for origin in os.getenv("CORS_ALLOW_ORIGINS", "").split(",")
    if origin.strip()
]
app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins or ["*"],
    allow_methods=["*"],
    allow_headers=["*"],
    allow_credentials=bool(_cors_origins),
)

# Serve static files (put design.html and any assets inside ./static/)
static_dir = os.path.join(os.path.dirname(__file__), "static")
# exist_ok makes a separate isdir() check unnecessary.
os.makedirs(static_dir, exist_ok=True)
app.mount("/static", StaticFiles(directory=static_dir), name="static")
218
+
# Serve design.html at root
@app.get("/", response_class=HTMLResponse)
async def index():
    """Return ./static/design.html, or a short hint page when it is missing."""
    fpath = os.path.join(static_dir, "design.html")
    try:
        # The context manager closes the handle; the original leaked it.
        with open(fpath, "r", encoding="utf-8") as fh:
            return HTMLResponse(fh.read())
    except FileNotFoundError:
        return HTMLResponse("<h3>Place design.html inside ./static/ and reload.</h3>")
226
+
# ---------- IN-MEMORY STATE & STORE ----------
IN_MEMORY_STORE = {}  # key -> {"data": bytes, "name": str, "mime": str}

# Upper bound on stored download artefacts. Each generation request stores
# three files, so without a cap the process memory grows for as long as the
# server runs.
MAX_STORED_RESULTS = 60

# Process-wide counters reported by the /status endpoint.
STATE = {
    "pdf_uploads": 0,
    "last_pdf_hash": None,
    "last_pdf_pages": 0,
    "mcq_count": 0,
    "desc_count": 0,
}


def store_result_bytes(key: str, data: bytes, filename: str, mime: str):
    """Store a downloadable artefact under `key`, evicting the oldest entries.

    Args:
        key: Lookup key later passed to /download/{key}.
        data: Raw file content.
        filename: Name offered to the client in Content-Disposition.
        mime: Media type of the content.

    At most MAX_STORED_RESULTS entries are kept; the oldest are dropped first.
    """
    # Remove first so a re-stored key moves to the "newest" end of the dict.
    IN_MEMORY_STORE.pop(key, None)
    IN_MEMORY_STORE[key] = {"data": data, "name": filename, "mime": mime}
    while len(IN_MEMORY_STORE) > MAX_STORED_RESULTS:
        # dicts preserve insertion order, so the first key is the oldest.
        del IN_MEMORY_STORE[next(iter(IN_MEMORY_STORE))]
239
+
@app.get("/download/{key}")
async def download_key(key: str):
    """Stream a previously stored artefact as an attachment, or return a 404."""
    stored = IN_MEMORY_STORE.get(key)
    if stored:
        disposition = {"Content-Disposition": f"attachment; filename={stored['name']}"}
        return StreamingResponse(
            io.BytesIO(stored["data"]),
            media_type=stored["mime"],
            headers=disposition,
        )
    return JSONResponse({"error": "Not found"}, status_code=404)
247
+
@app.get("/status")
async def status():
    """Return counters for the top dashboard (PDF uploads, pages, counts)."""
    # last_pdf_hash is internal bookkeeping and is deliberately not exposed.
    exposed = ("pdf_uploads", "last_pdf_pages", "mcq_count", "desc_count")
    return {name: STATE[name] for name in exposed}
257
+
258
+ # ---------- UTIL HELPERS (ported from your Streamlit code) ----------
# ASCII control characters (0x00-0x1F and DEL). Compiled once because
# clean_text is called for every parsed table-of-contents entry.
_CONTROL_CHARS_RE = re.compile(r"[\x00-\x1F\x7F]")


def clean_text(text: str) -> str:
    """Return `text` as a string with ASCII control characters removed.

    None becomes an empty string and other values are converted with str().
    Tabs and newlines are control characters too, so they are removed as well.
    """
    if text is None:
        return ""
    return _CONTROL_CHARS_RE.sub("", str(text))
263
+
def _extend_index_range(scores, start_idx: int, consecutive_break: int) -> int:
    """Return the last page index of the index-like run starting at start_idx.

    Pages scoring at least 1.0 extend the run; the scan stops once
    `consecutive_break` pages in a row score below that.
    """
    end_idx = start_idx
    gap = 0
    for i in range(start_idx + 1, len(scores)):
        if scores[i] >= 1.0:
            end_idx = i
            gap = 0
            continue
        gap += 1
        if gap >= consecutive_break:
            break
    return end_idx


def detect_index_range(doc, min_section_hits: int = 3, consecutive_break: int = 2) -> Tuple[int, int]:
    """Guess which pages of `doc` hold the table of contents.

    Each page gets a heuristic score: one point per "N.M"-style section number,
    0.6 per dotted leader or trailing page number, and 5 when the page mentions
    "contents" / "table of contents".

    Args:
        doc: Object exposing `page_count` and `load_page(i).get_text("text")`.
        min_section_hits: Minimum score a page needs to start the range when no
            page mentions "contents".
        consecutive_break: Number of consecutive low-scoring pages that ends
            the range.

    Returns:
        (start, end) as 1-based, inclusive page numbers.

    Raises:
        ValueError: If no page looks like the start of an index.
    """
    scores = []
    first_contents_page = None
    for pno in range(doc.page_count):
        try:
            text = doc.load_page(pno).get_text("text") or ""
        except Exception:
            # An unreadable page simply scores zero.
            text = ""
        has_contents = bool(
            re.search(r"\btable of contents\b|\bcontents\b", text.lower())
        )
        count_sections = len(re.findall(r"\b\d{1,2}\.\d+\b", text))
        count_leaders = len(re.findall(r"\.{2,}\s*\d+|\s+\d{1,3}\s*$", text, re.M))
        scores.append(count_sections + 0.6 * count_leaders + (5 if has_contents else 0))
        if has_contents and first_contents_page is None:
            first_contents_page = pno

    if first_contents_page is not None:
        # An explicit "contents" heading wins over the score threshold.
        start_idx = first_contents_page
    else:
        start_idx = next(
            (i for i, s in enumerate(scores) if s >= min_section_hits), None
        )
        if start_idx is None:
            raise ValueError("Could not auto-detect contents/index pages.")

    end_idx = _extend_index_range(scores, start_idx, consecutive_break)
    return (start_idx + 1, end_idx + 1)
313
+
314
+ # ---------- OLLAMA CALLS & PARSERS ----------
315
+ import time, os, requests, json
316
+
317
+
def call_ollama(prompt: str) -> str:
    """Send `prompt` as a single user message to the OpenRouter chat endpoint.

    Despite the name, the request goes to OPENROUTER_API_URL using
    OPENROUTER_MODEL and OPENROUTER_HEADERS.

    Returns:
        The stripped text of the first choice, or a string starting with
        "LOCAL_MODEL_ERROR: " when anything fails (HTTP error, timeout,
        unexpected response shape). This function never raises.
    """
    try:
        body = {
            "model": OPENROUTER_MODEL,  # e.g. "meta-llama/llama-3.3-70b-instruct:free"
            "messages": [{"role": "user", "content": prompt}],
        }
        response = requests.post(
            OPENROUTER_API_URL,
            headers=OPENROUTER_HEADERS,
            json=body,
            timeout=120,
        )
        response.raise_for_status()
        # OpenRouter chat completion shape
        first_choice = response.json()["choices"][0]
        return first_choice["message"]["content"].strip()
    except Exception as e:
        return f"LOCAL_MODEL_ERROR: {str(e)}"
338
+
def summarize_text(text: str, model: str = MODEL, max_words: int = 200) -> str:
    """
    Basic fallback summarizer using the same LLM call function.

    Args:
        text: Text to summarize; blank input yields "".
        model: Accepted for interface compatibility; call_ollama always uses
            the configured model, so this value is not forwarded.
        max_words: Word limit requested from the model, and the length of the
            truncation fallback.

    Returns:
        The model's summary. If the model call fails or returns nothing, the
        first `max_words` words of `text` are returned instead.
    """
    if not text or not text.strip():
        return ""

    prompt = f"""
Summarize the following text clearly and concisely in no more than {max_words} words.
Do not omit key information.

TEXT:
{text}
"""
    try:
        summary = (call_ollama(prompt) or "").strip()
    except Exception:
        summary = ""

    # call_ollama reports failures as a "LOCAL_MODEL_ERROR: ..." string rather
    # than raising, so that sentinel must be checked here; otherwise the error
    # message itself would be returned as the summary.
    if summary and not summary.startswith("LOCAL_MODEL_ERROR"):
        return summary

    # worst-case fallback: truncate
    return " ".join(text.split()[:max_words])
360
+
361
+
def _parse_mcq_output(out: str, num_qs: int) -> list:
    """Parse "Q1. ... A) ... Answer: X" blocks from model output into dicts."""
    mcqs = []
    for block in re.split(r"Q\d+\.", out)[1:]:
        block = block.strip()
        lines = [ln.strip() for ln in block.split("\n") if ln.strip()]
        if not lines:
            continue

        question = lines[0]

        # Strip whatever label the model wrote and re-label A-D by position,
        # so labels are never duplicated or out of order.
        option_texts = [
            re.sub(r"^[A-D]\)\s*", "", ln).strip()
            for ln in lines
            if re.match(r"^[A-D]\)", ln)
        ]
        options = [
            f"{chr(ord('A') + i)}) {text}"
            for i, text in enumerate(option_texts[:4])
        ]

        found = re.search(r"Answer:\s*([A-D])", block)
        answer = found.group(1) if found else ""

        # `answer not in "ABCD"` would accept "" (the empty string is a
        # substring of every string), so test for a missing answer explicitly.
        if not question or len(options) < 4 or not answer:
            continue

        mcqs.append({"question": question, "options": options, "answer": answer})
        if len(mcqs) == num_qs:
            break

    return mcqs


def generate_mcqs_ollama(topic: str, num_qs: int = 5, context: str = ""):
    """Ask the LLM for multiple-choice questions on `topic` and parse them.

    Args:
        topic: Topic title inserted into the prompt.
        num_qs: Number of questions requested; parsing stops once this many
            valid questions have been collected.
        context: Source text the model is told to rely on exclusively. Only the
            first 4000 characters are sent. When blank, a context-free prompt
            is used instead.

    Returns:
        A list of {"question": str, "options": ["A) ..", .. "D) .."],
        "answer": "A".."D"} dicts. Blocks without four options or without a
        parseable answer are dropped. Returns [] when the model call fails or
        returns nothing.
    """
    # Use textbook extract as the ONLY source
    ctx = (context or "").strip()
    if ctx:
        # keep context size under control
        ctx = ctx[:4000]
        prompt = f"""
You are an exam question setter.

Use ONLY the following textbook extract as your source.
Do NOT use any outside knowledge.
Every question and option MUST be directly answerable from this text.

TEXTBOOK EXTRACT:
\"\"\"{ctx}\"\"\"

Topic: "{topic}"

Generate {num_qs} high-quality multiple-choice questions that are strictly based on the above extract.

STRICT FORMAT (do not add anything before or after this):

Q1. <question>
A) <option>
B) <option>
C) <option>
D) <option>
Answer: <A/B/C/D>
"""
    else:
        # fallback if context somehow empty
        prompt = f"""
Generate {num_qs} high-quality multiple-choice questions on: "{topic}"

STRICT FORMAT (do not break this):

Q1. <question>
A) <option>
B) <option>
C) <option>
D) <option>
Answer: <A/B/C/D>
"""

    out = call_ollama(prompt).strip()

    if out.startswith("LOCAL_MODEL_ERROR") or not out:
        return []

    return _parse_mcq_output(out, num_qs)
451
+
def generate_descriptive_with_answers(topic: str, num_qs: int = 3, context: str = ""):
    """Ask the LLM for descriptive questions with answers and parse them.

    Only the first 4000 characters of `context` are sent; a blank context
    switches to a prompt that relies on the topic alone.

    Returns:
        A list of {"question": str, "answer": str} dicts, at most `num_qs`
        long. Entries whose question or answer is shorter than three
        characters are dropped. Returns [] when the model call fails or
        returns nothing.
    """
    extract = (context or "").strip()
    if extract:
        ctx = extract[:4000]
        prompt = f"""
You are an exam question setter.

Use ONLY the following textbook extract as your source.
Do NOT use any outside knowledge.
Every question and answer MUST be directly supported by this text.

TEXTBOOK EXTRACT:
\"\"\"{ctx}\"\"\"

Topic: "{topic}"

Generate {num_qs} descriptive / short-answer questions WITH answers.

STRICT FORMAT:

Q1. <question>
Answer: <answer>

NO extra text.
NO levels.
NO bullet points.
"""
    else:
        prompt = f"""
Generate {num_qs} descriptive questions WITH answers about: "{topic}"

STRICT FORMAT:

Q1. <question>
Answer: <answer>

NO extra text.
NO levels.
NO bullet points.
"""

    raw = call_ollama(prompt).strip()
    if not raw or raw.startswith("LOCAL_MODEL_ERROR"):
        return []

    parsed = []
    # Everything before the first "Q<n>." marker is preamble and is ignored.
    for chunk in re.split(r"Q\d+\.", raw)[1:]:
        chunk = chunk.strip()
        question_line = chunk.split("\n")[0].strip()

        # re.S lets the answer span several lines, up to the end of the chunk.
        match = re.search(r"Answer:\s*(.*)", chunk, re.S)
        answer_text = match.group(1).strip() if match else ""

        if len(question_line) >= 3 and len(answer_text) >= 3:
            parsed.append({"question": question_line, "answer": answer_text})
            if len(parsed) == num_qs:
                break

    return parsed
519
+
520
+
521
+
def build_docx_bytes(questions_data: dict) -> bytes:
    """Render the generated questions as a .docx document and return its bytes.

    `questions_data` maps a topic title to {"mcqs": [...], "descriptive": [...]}.
    Descriptive entries may be dicts or plain values; plain values are printed
    as the question text with no answer.
    """
    document = Document()
    document.add_heading("Generated Questions", level=1)

    for topic_title, blocks in questions_data.items():
        document.add_heading(topic_title, level=2)

        mcq_items = blocks.get("mcqs", []) or []
        if mcq_items:
            document.add_paragraph("Multiple Choice Questions:")
            for number, mcq in enumerate(mcq_items, start=1):
                document.add_paragraph(f"{number}. {mcq.get('question','')}")
                for opt in mcq.get("options", []):
                    document.add_paragraph(f" {opt}")
                ans = mcq.get("answer", "")
                diff = mcq.get("difficulty", "N/A")
                footer = (
                    f" Answer: {ans} Difficulty: {diff}"
                    if ans
                    else f" Difficulty: {diff}"
                )
                document.add_paragraph(footer)
                document.add_paragraph("")

        descriptive_items = blocks.get("descriptive", []) or []
        if descriptive_items:
            document.add_paragraph("Descriptive / Short-answer Questions:")
            for number, entry in enumerate(descriptive_items, start=1):
                if isinstance(entry, dict):
                    q = entry.get("question", "")
                    a = entry.get("answer", "")
                    diff = entry.get("difficulty", "N/A")
                else:
                    q, a, diff = str(entry), "", "N/A"
                document.add_paragraph(f"{number}. {q}")
                if a:
                    document.add_paragraph(f" Answer: {a}")
                document.add_paragraph(f" Difficulty: {diff}")
                document.add_paragraph("")

    buffer = BytesIO()
    document.save(buffer)
    # getvalue() returns the whole buffer regardless of the stream position.
    return buffer.getvalue()
561
+
_QUESTION_COLUMNS = [
    "Topic",
    "Type",
    "Question",
    "Option A",
    "Option B",
    "Option C",
    "Option D",
    "Answer",
    "Difficulty",
    "Descriptive Answer",
]


def build_dfs_from_questions(questions_data: dict):
    """Flatten the generated questions into one DataFrame, one row per question.

    `questions_data` maps a topic title to {"mcqs": [...], "descriptive": [...]}.
    Missing or None lists are treated as empty, and a descriptive entry that is
    not a dict is used as the question text - the same tolerance
    build_docx_bytes applies.

    Returns:
        A pandas DataFrame with the columns in _QUESTION_COLUMNS. The columns
        are present even when there are no rows, so an exported CSV or Excel
        sheet always has a header.
    """
    rows = []
    for topic_title, topic_data in questions_data.items():
        topic_data = topic_data or {}

        for mcq in topic_data.get("mcqs") or []:
            opts = mcq.get("options") or []
            # Pad to four so a short option list leaves the rest blank.
            padded = list(opts[:4]) + [""] * (4 - len(opts[:4]))
            rows.append({
                "Topic": topic_title,
                "Type": "MCQ",
                "Question": mcq.get("question", ""),
                "Option A": padded[0],
                "Option B": padded[1],
                "Option C": padded[2],
                "Option D": padded[3],
                "Answer": mcq.get("answer", ""),
                "Difficulty": mcq.get("difficulty", "N/A"),
                "Descriptive Answer": "",
            })

        for dq in topic_data.get("descriptive") or []:
            if isinstance(dq, dict):
                question = dq.get("question", "")
                answer = dq.get("answer", "")
                difficulty = dq.get("difficulty", "N/A")
            else:
                question, answer, difficulty = str(dq), "", "N/A"
            rows.append({
                "Topic": topic_title,
                "Type": "Descriptive",
                "Question": question,
                "Option A": "",
                "Option B": "",
                "Option C": "",
                "Option D": "",
                "Answer": "",
                "Difficulty": difficulty,
                "Descriptive Answer": answer,
            })

    return pd.DataFrame(rows, columns=_QUESTION_COLUMNS)
590
+
591
+ # ---------- ENDPOINTS: PDF / TOC / GENERATION ----------
@app.post("/extract_toc")
async def extract_toc(file: UploadFile = File(...)):
    """Parse the uploaded PDF's table of contents into numbered sections.

    Returns:
        On success: {"status": "success", "matches": [{"subnum", "title",
        "page"}, ...], "chapters_count": int, "pages": int}.
        On failure: {"status": "error", "error": str}.
    """
    pdf_bytes = await file.read()
    doc = None
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        # update page count state (not counting as upload until generation)
        STATE["last_pdf_pages"] = getattr(doc, "page_count", 0)

        # Try detect TOC pages and parse numeric headings
        try:
            start, end = detect_index_range(doc)
        except Exception:
            # No recognisable index: fall back to the first six pages.
            start, end = 1, min(6, doc.page_count)
        text = "\n".join(
            doc.load_page(p - 1).get_text("text") or ""
            for p in range(start, end + 1)
        )

        matches = []
        raw_matches = re.findall(r"(\d{1,2}\.\d+)\s+(.+?)\s+(\d{1,4})\b", text)
        if raw_matches:
            for num, title, pno in raw_matches:
                # Collapse dotted leaders, then trim stray dots and whitespace.
                title_clean = re.sub(r"\.{2,}", ".", title).strip(" .\t")
                title_clean = clean_text(title_clean)
                page_no = int(pno) if pno.isdigit() else None
                matches.append({"subnum": num.strip(), "title": title_clean, "page": page_no})
        else:
            # fallback: search simple lines
            for ln in text.splitlines():
                m = re.match(r'^\s*(\d{1,2}\.\d+)\s+(.+?)\s+(\d{1,4})\s*$', ln)
                if m:
                    matches.append({
                        "subnum": m.group(1),
                        "title": clean_text(m.group(2).strip()),
                        "page": int(m.group(3)),
                    })

        # Count distinct chapter numbers (the part before the first dot).
        chapter_ids = set()
        for m in matches:
            head = m["subnum"].split(".")[0]
            chapter_ids.add(int(head) if head.isdigit() else 0)

        return {
            "status": "success",
            "matches": matches,
            "chapters_count": len(chapter_ids),
            "pages": STATE["last_pdf_pages"],
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}
    finally:
        # Release the document's resources on every path.
        if doc is not None:
            doc.close()
627
+
@app.post("/generate_pdf_mcqs")
async def generate_pdf_mcqs(
    file: UploadFile = File(...),
    chapters: str = Form("[]"),
    question_type: str = Form("both"),  # "mcq", "descriptive", or "both"
    mcq_source: str = Form("llama_open"),  # currently unused by backend, kept for future use
    num_mcqs: int = Form(5),  # Number of MCQs per topic
    num_desc: int = Form(3),  # Number of descriptive questions per topic
):
    """Generate questions for every table-of-contents topic of an uploaded PDF.

    Args:
        file: The PDF upload.
        chapters: JSON list of chapter numbers to keep; an empty list keeps all
            topics. Numbers may arrive as ints or numeric strings.
        question_type: "mcq", "descriptive" or "both".
        mcq_source: Not used.
        num_mcqs: MCQs requested per topic.
        num_desc: Descriptive questions requested per topic.

    Returns:
        On success, a dict with the generated results, per-type counts,
        download keys for CSV/Excel/DOCX exports, the global counters and the
        outcome of the database save ("db_save"). On failure,
        {"status": "error", "error": str}.

    NOTE(review): the LLM calls below are synchronous inside an async handler,
    so the event loop is blocked while they run.
    """
    pdf_bytes = await file.read()
    qtype = (question_type or "both").lower()
    doc = None

    try:
        # Parsed inside the try so malformed JSON yields the normal error
        # response instead of an unhandled exception.
        selected_chapters = json.loads(chapters) if chapters else []
        if selected_chapters and not isinstance(selected_chapters, list):
            selected_chapters = [selected_chapters]
        # Normalise to ints: form clients commonly send ["1", "2"], which would
        # never equal the int chapter numbers parsed from the index.
        wanted_chapters = set()
        for raw in selected_chapters or []:
            try:
                wanted_chapters.add(int(raw))
            except (TypeError, ValueError):
                continue

        # Count an upload only when the content differs from the previous one.
        md5 = hashlib.md5(pdf_bytes).hexdigest()
        if STATE.get("last_pdf_hash") != md5:
            STATE["pdf_uploads"] += 1
            STATE["last_pdf_hash"] = md5

        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        STATE["last_pdf_pages"] = getattr(doc, "page_count", 0)
        full_text = "\n".join(
            doc.load_page(p).get_text("text") or "" for p in range(doc.page_count)
        )

        try:
            start, end = detect_index_range(doc)
            index_text = "\n".join(
                doc.load_page(p - 1).get_text("text") or ""
                for p in range(start, end + 1)
            )
        except Exception:
            index_text = full_text[:4000]

        raw_matches = re.findall(r"(\d{1,2}\.\d+)\s+(.+?)\s+(\d{1,4})\b", index_text)
        topics = []
        if raw_matches:
            for num, title, pno in raw_matches:
                title_clean = clean_text(re.sub(r"\.{2,}", ".", title).strip(" .\t"))
                page_no = int(pno) if pno.isdigit() else None
                topics.append({"subnum": num, "title": title_clean, "page": page_no})
        else:
            for ln in index_text.splitlines():
                m = re.match(r'^\s*(\d{1,2}\.\d+)\s+(.+)$', ln)
                if m:
                    topics.append({
                        "subnum": m.group(1),
                        "title": clean_text(m.group(2).strip()),
                        "page": None,
                    })

        # Filter by selected chapters if provided
        if selected_chapters:
            filtered = []
            for t in topics:
                head = t["subnum"].split(".")[0]
                chap_no = int(head) if head.isdigit() else 0
                if chap_no in wanted_chapters:
                    filtered.append(t)
            topics = filtered

        # Decide which types to produce
        produce_mcq = qtype in ("mcq", "both")
        produce_desc = qtype in ("descriptive", "both")

        # Generate questions for each topic (only requested types)
        results = {}
        total_mcqs_generated = 0
        total_desc_generated = 0

        for t in topics:
            title = t["title"]
            if t.get("page"):
                # Take the listed page and its neighbours as context.
                # NOTE(review): assumes the printed page number matches the PDF
                # page index - confirm for documents with front matter.
                pg = t["page"]
                startp = max(0, pg - 2)
                endp = min(doc.page_count, pg + 1)
                context = "\n".join(
                    doc.load_page(p).get_text("text") or ""
                    for p in range(startp, endp)
                )
            else:
                context = index_text[:2000]

            mcqs = (
                generate_mcqs_ollama(title, num_qs=num_mcqs, context=context)
                if produce_mcq
                else []
            )
            descriptive = (
                generate_descriptive_with_answers(title, num_qs=num_desc, context=context)
                if produce_desc
                else []
            )
            total_mcqs_generated += len(mcqs)
            total_desc_generated += len(descriptive)

            # Two index entries can share a title; merge them so the reported
            # totals always match what is stored and exported.
            entry = results.setdefault(title, {"mcqs": [], "descriptive": []})
            entry["mcqs"].extend(mcqs)
            entry["descriptive"].extend(descriptive)

        # Save the generated questions to the database
        db_save = save_questions_to_db(results)

        # Build files and store them
        df_all = build_dfs_from_questions(results)

        # CSV
        csv_bytes = df_all.to_csv(index=False).encode("utf-8")
        csv_key = hashlib.md5(csv_bytes).hexdigest()
        store_result_bytes(csv_key, csv_bytes, "questions.csv", "text/csv")

        # Excel
        excel_buf = BytesIO()
        with pd.ExcelWriter(excel_buf, engine="xlsxwriter") as writer:
            df_all.to_excel(writer, sheet_name="Questions", index=False)
        excel_bytes = excel_buf.getvalue()
        excel_key = hashlib.md5(excel_bytes).hexdigest()
        store_result_bytes(
            excel_key,
            excel_bytes,
            "questions.xlsx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )

        # DOCX
        docx_bytes = build_docx_bytes(results)
        docx_key = hashlib.md5(docx_bytes).hexdigest()
        store_result_bytes(
            docx_key,
            docx_bytes,
            "questions.docx",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )

        # Update global state with exact counts
        STATE["mcq_count"] = STATE.get("mcq_count", 0) + total_mcqs_generated
        STATE["desc_count"] = STATE.get("desc_count", 0) + total_desc_generated

        return {
            "status": "success",
            "results_count_topics": len(results),
            "mcqCount": total_mcqs_generated,  # Exact count of MCQs generated
            "descCount": total_desc_generated,  # Exact count of descriptive questions generated
            "download_keys": {"csv": csv_key, "excel": excel_key, "docx": docx_key},
            "pages": STATE["last_pdf_pages"],
            "global_state": {
                "pdf_uploads": STATE["pdf_uploads"],
                "last_pdf_pages": STATE["last_pdf_pages"],
                "mcq_count": STATE["mcq_count"],
                "desc_count": STATE["desc_count"],
            },
            "results": results,  # for immediate front-end rendering
            "requested_mcqs_per_topic": num_mcqs,  # For debugging
            "requested_desc_per_topic": num_desc,  # For debugging
            "db_save": db_save,  # outcome of persisting the questions
        }

    except Exception as e:
        return {"status": "error", "error": str(e)}
    finally:
        # Release the document's resources on every path.
        if doc is not None:
            doc.close()
767
+
@app.get("/questions")
def get_questions(search: str = None, qtype: str = None, flagged: bool = None):
    """List stored questions, newest first, with optional filters.

    Args:
        search: Case-insensitive substring matched against the question text,
            topic, the four options and both answer columns.
        qtype: Question type to keep ("MCQ" / "Descriptive", in any letter
            case); "all" or an empty value disables the filter.
        flagged: When given, keep only rows whose flagged column equals it.

    Returns:
        A list of question dicts, or a 500 JSON response with an "error" key.
    """
    # Imported here so the function does not depend on import order elsewhere
    # in the module.
    from sqlalchemy import func, or_

    db = SessionLocal()
    try:
        query = db.query(Question)

        # Filter by flagged status if provided
        if flagged is not None:
            query = query.filter(Question.flagged == flagged)

        if search:
            search_term = f"%{search}%"
            searchable = (
                Question.question,
                Question.topic,
                Question.option_a,
                Question.option_b,
                Question.option_c,
                Question.option_d,
                Question.answer,
                Question.descriptive_answer,
            )
            query = query.filter(or_(*(col.ilike(search_term) for col in searchable)))

        # Types are stored as "MCQ" / "Descriptive", while clients may send
        # "mcq" or "descriptive"; compare case-insensitively so those match.
        if qtype and qtype.lower() != "all":
            query = query.filter(func.lower(Question.type) == qtype.lower())

        questions = query.order_by(Question.created_at.desc()).all()

        # Convert to dict for JSON serialization
        return [
            {
                "id": q.id,
                "topic": q.topic,
                "type": q.type,
                "question": q.question,
                "option_a": q.option_a,
                "option_b": q.option_b,
                "option_c": q.option_c,
                "option_d": q.option_d,
                "answer": q.answer,
                "descriptive_answer": q.descriptive_answer,
                "difficulty": q.difficulty,
                "flagged": q.flagged,
                "created_at": q.created_at.isoformat() if q.created_at else None,
            }
            for q in questions
        ]

    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=500)
    finally:
        db.close()
822
+
823
+ # Update the flag update function to handle individual question flagging
def _normalize_flag(value):
    """Coerce a client-supplied flag to True, False or None (pending).

    Accepts booleans, None, numbers and the usual textual spellings.

    Raises:
        ValueError: If the value cannot be interpreted as a flag.
    """
    if value is None or isinstance(value, bool):
        return value
    if isinstance(value, (int, float)):
        return bool(value)
    if isinstance(value, str):
        token = value.strip().lower()
        if token in ("true", "1", "yes"):
            return True
        if token in ("false", "0", "no"):
            return False
        if token in ("", "null", "none"):
            return None
    raise ValueError(f"Invalid flagged value: {value!r}")


@app.post("/update_question_flag")
async def update_question_flag(question_data: dict):
    """
    Update the flagged status of a question.

    Expects {"id": <question id>, "flagged": <true/false/null>}.

    Returns:
        {"status": "success", "message", "question_id", "flagged"} where
        "flagged" is the value actually stored (True, False or None), or
        {"status": "error", "error": str}.
    """
    db = SessionLocal()
    try:
        question_id = question_data.get('id')
        if not question_id:
            return {"status": "error", "error": "Question ID is required"}

        # Convert to a real boolean (or None) so the column never receives
        # strings such as "true".
        flagged = _normalize_flag(question_data.get('flagged'))

        question = db.query(Question).filter(Question.id == question_id).first()
        if not question:
            return {"status": "error", "error": "Question not found"}

        question.flagged = flagged
        db.commit()

        return {
            "status": "success",
            "message": f"Question {question_id} flagged status updated to {flagged}",
            "question_id": question_id,
            # Report what was stored; bool(None) would misreport "pending"
            # as False.
            "flagged": flagged,
        }

    except Exception as e:
        db.rollback()
        return {"status": "error", "error": str(e)}
    finally:
        db.close()
857
+
858
+
859
+
@app.post("/save_questions_to_db")
async def save_questions_to_db_endpoint(data: dict):
    """Persist a results payload via save_questions_to_db.

    Returns:
        200 with {"status": "success", ...counts} when the save succeeded, or
        500 with {"status": "error", "error": str} when it failed.
    """
    try:
        outcome = save_questions_to_db(data)
    except Exception as e:
        return JSONResponse(content={"status": "error", "error": str(e)}, status_code=500)

    # save_questions_to_db reports failures through its return value instead of
    # raising, so the result must be inspected; otherwise a rolled-back save
    # would still be reported as a success.
    if isinstance(outcome, dict) and outcome.get("status") == "error":
        return JSONResponse(content=outcome, status_code=500)

    payload = {"status": "success"}
    if isinstance(outcome, dict):
        payload.update(outcome)  # adds the saved / skipped counters
    return JSONResponse(content=payload)
867
+
868
+
869
+
870
+
871
+
872
+
873
+ import re
874
+ from random import sample
875
+
876
+
877
+
878
+ from sqlalchemy import or_, and_
879
# Suffixes such as "Answer: B Difficulty: 3" that leak into stored option text.
_OPTION_SUFFIX_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'\s*Answer:\s*[A-D]\s*Difficulty:\s*\d\s*$',
        r'\s*Difficulty:\s*\d\s*Answer:\s*[A-D]\s*$',
        r'\s*Answer:\s*[A-D]\s*$',
        r'\s*Difficulty:\s*\d\s*$',
    )
)
_OPTION_TRAILING_RE = re.compile(r'[\.\s]*$')


def _clean_option(option_text):
    """Strip leaked answer/difficulty suffixes and trailing dots/space from an option."""
    if not option_text:
        return option_text
    for pattern in _OPTION_SUFFIX_PATTERNS:
        option_text = pattern.sub('', option_text)
    return _OPTION_TRAILING_RE.sub('', option_text).strip()


@app.post("/generate_question_paper")
async def generate_question_paper(request_data: dict):
    """
    Generate a question paper with random questions based on the selected levels, types, and topics.

    Request keys:
      - ``levels``: mapping of difficulty level (1-5) to the number of
        questions wanted at that level. Keys and counts may arrive as
        strings; entries that cannot be read as integers are ignored.
      - ``types``: mapping with ``mcq`` / ``descriptive`` booleans
        (both default to True).
      - ``topics``: ``'all'``, a single topic, or a list of topics.

    Returns a dict with ``status`` and, on success, the sanitized questions
    (answers are not included), per-level counts and the filters applied.
    """
    db = SessionLocal()
    try:
        # Extract parameters from request data
        levels = request_data.get('levels', {})
        types = request_data.get('types', {'mcq': True, 'descriptive': True})
        topics = request_data.get('topics', 'all')

        # Normalise topics to a list; an empty list means "no topic filter".
        if topics == 'all':
            selected_topics = []
        else:
            selected_topics = topics if isinstance(topics, list) else [topics]

        query_filters = []

        # Filter by question type. If neither type is enabled no type filter
        # is added (same as the previous behaviour).
        type_filters = []
        if types.get('mcq', True):
            type_filters.append(Question.type == 'MCQ')
        if types.get('descriptive', True):
            type_filters.append(Question.type == 'Descriptive')
        if type_filters:
            query_filters.append(or_(*type_filters))

        # Filter by topic if specific topics are selected
        if selected_topics:
            query_filters.append(Question.topic.in_(selected_topics))

        # IMPORTANT: only approved questions
        # NOTE(review): applied unconditionally here; confirm it was not meant
        # to be limited to the topic-filtered case.
        query_filters.append(Question.flagged == True)

        query = db.query(Question).filter(and_(*query_filters))
        all_questions = query.all()

        # Group questions by difficulty level
        questions_by_level = {1: [], 2: [], 3: [], 4: [], 5: []}
        for q in all_questions:
            # str() so a numeric difficulty column does not break .isdigit().
            difficulty_text = str(q.difficulty).strip() if q.difficulty is not None else ""
            if difficulty_text.isdigit():
                level = int(difficulty_text)
                if level in questions_by_level:
                    questions_by_level[level].append(q)

        # Create a paper by selecting random questions from each level
        question_paper = []
        total_selected = 0
        level_summary = {}

        for raw_level, raw_count in levels.items():
            try:
                level = int(raw_level)
                count = int(raw_count)
            except (TypeError, ValueError):
                # Malformed entry (e.g. "abc" or null): skip it rather than
                # failing the whole request.
                continue
            if count <= 0 or level not in questions_by_level:
                continue
            available_questions = questions_by_level[level]
            if not available_questions:
                level_summary[level] = 0
                continue
            num_to_select = min(count, len(available_questions))
            question_paper.extend(sample(available_questions, num_to_select))
            total_selected += num_to_select
            level_summary[level] = num_to_select

        # Build the sanitized payload (no answers are exposed).
        paper_data = [
            {
                "id": q.id,
                "topic": q.topic,
                "type": q.type,
                "question": (q.question or "").strip(),
                "option_a": _clean_option(q.option_a),
                "option_b": _clean_option(q.option_b),
                "option_c": _clean_option(q.option_c),
                "option_d": _clean_option(q.option_d),
                "flagged": q.flagged,
                "difficulty": q.difficulty
            }
            for q in question_paper
        ]

        return {
            "status": "success",
            "questions": paper_data,
            "total_selected": total_selected,
            "level_summary": level_summary,
            "filters_applied": {
                "levels": levels,
                "types": types,
                "topics": selected_topics if selected_topics else "all"
            },
            "message": f"Generated paper with {total_selected} questions"
        }

    except Exception as e:
        return {"status": "error", "error": str(e)}
    finally:
        db.close()
1003
+
1004
@app.post("/update_question")
async def update_question(question_data: dict):
    """
    Apply a set of field updates to one question.

    ``question_data`` carries the question ``id`` and an ``updates`` mapping
    of field name to new value. Only whitelisted fields that exist on the
    model are written; anything else in ``updates`` is ignored.
    """
    # Fields a client is allowed to modify.
    editable_fields = ['topic', 'question', 'option_a', 'option_b', 'option_c', 'option_d',
                       'answer', 'descriptive_answer', 'difficulty', 'flagged']

    db = SessionLocal()
    try:
        question_id = question_data.get('id')
        updates = question_data.get('updates', {})

        if not question_id:
            return {"status": "error", "error": "Question ID is required"}

        record = db.query(Question).filter(Question.id == question_id).first()
        if not record:
            return {"status": "error", "error": "Question not found"}

        for name, new_value in updates.items():
            if name not in editable_fields or not hasattr(record, name):
                continue
            setattr(record, name, new_value)

        db.commit()

        response = {
            "status": "success",
            "message": f"Question {question_id} updated successfully",
            "question_id": question_id,
            "updates": updates
        }
        return response

    except Exception as exc:
        db.rollback()
        return {"status": "error", "error": str(exc)}
    finally:
        db.close()
1043
+
1044
+
1045
+
1046
+
1047
@app.post("/bulk_update_flags")
async def bulk_update_flags(bulk_data: dict):
    """
    Update flagged status for multiple questions at once.

    ``bulk_data['question_updates']`` is a list of ``{"id": ..., "flagged": ...}``
    objects. Entries that are not objects, lack an ``id``, lack a ``flagged``
    key, or reference an unknown question are skipped. The flag is coerced
    to ``bool`` before it is stored. All changes are committed together.

    Returns a dict with ``status`` and, on success, ``updated_count``.
    """
    db = SessionLocal()
    try:
        question_updates = bulk_data.get('question_updates', [])

        if not question_updates:
            return {"status": "error", "error": "No question updates provided"}

        updated_count = 0
        for update in question_updates:
            # One malformed entry should not abort the whole batch.
            if not isinstance(update, dict):
                continue
            question_id = update.get('id')
            # Without an explicit 'flagged' value there is nothing to apply;
            # skipping avoids silently clearing the flag.
            if question_id is None or 'flagged' not in update:
                continue

            question = db.query(Question).filter(Question.id == question_id).first()
            if not question:
                continue

            question.flagged = bool(update['flagged'])
            updated_count += 1

        db.commit()

        return {
            "status": "success",
            "message": f"Updated flagged status for {updated_count} questions",
            "updated_count": updated_count
        }

    except Exception as e:
        db.rollback()
        return {"status": "error", "error": str(e)}
    finally:
        db.close()
1083
+
1084
+
1085
+
1086
+ import nltk
1087
+ from nltk.tokenize import sent_tokenize
1088
# sent_tokenize needs the Punkt sentence models. Recent NLTK releases look
# for the "punkt_tab" resource while older ones use "punkt", so request both.
# Downloads are best-effort: the server may be offline or the data may
# already be installed, so failures are deliberately ignored here.
for _nltk_resource in ("punkt", "punkt_tab"):
    try:
        nltk.download(_nltk_resource, quiet=True)
    except Exception:
        pass

# optional libs flags
try:
    import whisper
    _HAS_WHISPER = True
except Exception:
    _HAS_WHISPER = False

try:
    try:
        from moviepy.editor import VideoFileClip  # MoviePy 1.x layout
    except ImportError:
        from moviepy import VideoFileClip  # MoviePy 2.x removed moviepy.editor
    _HAS_MOVIEPY = True
except Exception:
    _HAS_MOVIEPY = False

# summarizer config (BART chunking)
CHUNK_WORDS = 800  # word budget per transcript chunk
SUMMARIZER_MODEL = "facebook/bart-large-cnn"
SUMMARY_MIN_LENGTH = 30  # number of leading words kept by the truncation fallback
1110
+
1111
+ # Local summarizer via transformers (optional, heavy)
1112
def split_transcript_into_chunks_by_words(transcript: str, chunk_words: int = CHUNK_WORDS):
    """
    Split a transcript into chunks of whole sentences.

    Sentences are appended to the current chunk until adding the next one
    would push it past ``chunk_words`` words; that sentence then starts a new
    chunk. A single sentence longer than ``chunk_words`` is kept intact as its
    own chunk.

    Returns a list of chunk strings (empty when the transcript has no sentences).
    """
    try:
        sentences = sent_tokenize(transcript)
    except LookupError:
        # Punkt data is missing (its download is best-effort). Fall back to a
        # plain split after sentence-ending punctuation instead of failing.
        sentences = [s for s in re.split(r'(?<=[.!?])\s+', transcript) if s.strip()]

    chunks, current, current_words = [], [], 0
    for sentence in sentences:
        wcount = len(sentence.split())
        if current and current_words + wcount > chunk_words:
            chunks.append(" ".join(current))
            current, current_words = [sentence], wcount
        else:
            current.append(sentence)
            current_words += wcount
    if current:
        chunks.append(" ".join(current))
    return chunks
1126
+
1127
# Loaded pipelines keyed by model name, so the model is not re-loaded per request.
_SUMMARIZER_CACHE = {}


def summarizer_pipeline(model_name=SUMMARIZER_MODEL):
    """
    Return a transformers summarization pipeline for ``model_name`` on CPU.

    The pipeline is built on first use and reused afterwards. Returns ``None``
    when it cannot be created (e.g. transformers is not installed or the model
    cannot be loaded); failures are not cached, so a later call retries.
    """
    cached = _SUMMARIZER_CACHE.get(model_name)
    if cached is not None:
        return cached
    try:
        from transformers import pipeline
        summarizer = pipeline("summarization", model=model_name, device=-1)  # CPU
    except Exception:
        return None
    _SUMMARIZER_CACHE[model_name] = summarizer
    return summarizer
1133
+
1134
def summarize_chunks(chunks, summarizer):
    """
    Summarize each chunk and return the summaries in the same order.

    ``summarizer`` is a callable in the style of a transformers summarization
    pipeline (returns ``[{"summary_text": ...}]``) or a falsy value. When no
    summarizer is given, or it raises for a chunk, that chunk's summary falls
    back to its first ``SUMMARY_MIN_LENGTH`` words.
    """
    summaries = []
    for chunk in chunks:
        summary_text = None
        if summarizer:
            try:
                # truncation=True keeps over-long chunks within the model's
                # input limit instead of erroring and dropping to the fallback.
                out = summarizer(
                    chunk,
                    max_length=400,
                    min_length=100,
                    do_sample=False,
                    truncation=True,
                )
                summary_text = out[0]['summary_text'].strip()
            except Exception:
                summary_text = None
        if summary_text is None:
            # fallback: truncate
            summary_text = " ".join(chunk.split()[:SUMMARY_MIN_LENGTH])
        summaries.append(summary_text)
    return summaries
1148
+
1149
def combine_and_summarize_summaries(summaries):
    """Join chunk summaries with blank lines between them; empty input gives ""."""
    return "\n\n".join(summaries) if summaries else ""
1153
+
1154
def summarize_transcript_with_bart(transcript: str):
    """
    Summarize a transcript chunk-by-chunk with the local BART pipeline.

    Returns ``{"overall": <joined summary>, "chunks": [<per-chunk summaries>]}``.
    Both values are empty when the transcript is blank or when the local
    summarizer is unavailable; in the latter case the caller is expected to
    fall back to the Ollama summarizer via summarize_text().
    """
    nothing = {"overall": "", "chunks": []}

    if not (transcript and transcript.strip()):
        return nothing

    pieces = split_transcript_into_chunks_by_words(transcript, CHUNK_WORDS)
    local_model = summarizer_pipeline(SUMMARIZER_MODEL)
    if local_model is None:
        # signal to caller that local summarizer isn't available
        return nothing

    per_chunk = summarize_chunks(pieces, local_model)
    return {
        "overall": combine_and_summarize_summaries(per_chunk),
        "chunks": per_chunk,
    }
1169
+
1170
+ # Robust MCQ parser (accepts many model output formats)
1171
# Phrases typical of a model's introductory sentence ("Here are the questions...").
_MCQ_INTRO_RE = re.compile(
    r"(here are|multiple[-\s]?choice questions|based on the summary|based on the topic|following questions|the following)",
    re.I,
)
# A line that is only a header such as "Question 3:" with no text after it.
_MCQ_HEADER_ONLY_RE = re.compile(r'^\s*(?:question|q)\s*\d+\b[:.\s-]*$', re.I)
# Leading "Q1." / "Question 2:" numbering on a question line.
_MCQ_NUMBER_PREFIX_RE = re.compile(r'^\s*(?:q|question)\s*\d+\s*[:.\-\)]*\s*', re.I)
# An option line such as "A) text", "b. text", "C: text".
_MCQ_OPTION_RE = re.compile(r'^([A-D])[\)\.\-:]\s+(.*)$', re.I)
# "Answer: B", "Correct answer: (C)", "Answer is D". The trailing \b stops the
# letter from matching the first character of a word ("Correct answer" -> "a").
_MCQ_ANSWER_RE = re.compile(
    r'^\s*(?:correct\s+answer|answer|correct)(?:\s+is)?[:\s\-]*\(?\s*([A-D])\b',
    re.I,
)
# A line consisting only of an option letter, e.g. "B" or "B)".
_MCQ_BARE_LETTER_RE = re.compile(r'^\s*([A-D])[\)\.\s]*$', re.I)


def parse_mcqs_freeform(output: str) -> List[Dict]:
    """
    Parse loosely formatted model output into multiple-choice questions.

    Each returned dict has ``question`` (numbering prefix removed),
    ``options`` (strings normalised to "A. text") and ``answer`` (an upper-case
    letter, or "" when no valid answer line was found). A question is kept
    only if at least two option lines follow it.

    Introductory sentences and bare "Question N" headers are dropped. A line
    that contains "?" or starts with question numbering is never treated as an
    introduction, so stems like "Which of the following ...?" are preserved.
    """
    mcqs = []
    if not output:
        return mcqs

    lines = []
    for raw in output.splitlines():
        ln = raw.strip()
        if not ln:
            continue
        looks_like_question = "?" in ln or _MCQ_NUMBER_PREFIX_RE.match(ln) is not None
        if _MCQ_INTRO_RE.search(ln) and not looks_like_question:
            continue
        if _MCQ_HEADER_ONLY_RE.match(ln):
            continue
        lines.append(ln)

    total = len(lines)
    i = 0
    while i < total:
        ln = lines[i]
        # skip stray option lines until we find a question
        if _MCQ_OPTION_RE.match(ln):
            i += 1
            continue
        question_text = _MCQ_NUMBER_PREFIX_RE.sub('', ln).strip()
        if len(question_text) < 3:
            i += 1
            continue

        # collect up to four consecutive option lines
        opts = []
        opt_map = {}
        j = i + 1
        while j < total and len(opts) < 4:
            m = _MCQ_OPTION_RE.match(lines[j])
            if not m:
                break
            label = m.group(1).upper()
            formatted = f"{label}. {m.group(2).strip()}"
            opts.append(formatted)
            opt_map[label] = formatted
            j += 1

        # look ahead a few lines for the answer
        answer = ""
        for candidate in lines[j:j + 6]:
            # An option line here belongs to a later question; stop so that
            # question's answer is not attributed to this one.
            if _MCQ_OPTION_RE.match(candidate):
                break
            m_ans = _MCQ_ANSWER_RE.match(candidate) or _MCQ_BARE_LETTER_RE.match(candidate)
            if m_ans:
                answer = m_ans.group(1).upper()
                break
        if answer and answer not in opt_map:
            answer = ""  # answer must refer to one of the parsed options

        if question_text and len(opts) >= 2:
            mcqs.append({"question": question_text, "options": opts, "answer": answer})
        # j is always past i, so this always makes progress.
        i = j
    return mcqs
1233
+ # whisper-based transcription (uses whisper library, raises if not installed)
1234
def split_audio(audio_path: str, chunk_length_sec: int = 300):
    """
    Split a WAV file into consecutive chunks of at most ``chunk_length_sec`` seconds.

    Returns ``[audio_path]`` unchanged when pydub is not installed, when
    ``chunk_length_sec`` is not positive, or when the audio already fits in a
    single chunk. Otherwise returns the paths of newly created temporary WAV
    files; the caller is responsible for deleting them.
    """
    try:
        from pydub import AudioSegment
    except Exception:
        return [audio_path]
    if chunk_length_sec <= 0:
        # A zero/negative length cannot drive range(); treat as "do not split".
        return [audio_path]

    import contextlib
    import os
    import wave

    with contextlib.closing(wave.open(audio_path, 'rb')) as wf:
        total_sec = wf.getnframes() / float(wf.getframerate())
    if total_sec <= chunk_length_sec:
        return [audio_path]

    audio = AudioSegment.from_wav(audio_path)
    chunk_ms = chunk_length_sec * 1000
    chunk_files = []
    try:
        for start_ms in range(0, len(audio), chunk_ms):
            tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
            # Only the reserved path is needed; close our handle so it is not
            # leaked and so the file can be reopened for writing.
            tmp.close()
            chunk_files.append(tmp.name)
            exported = audio[start_ms:start_ms + chunk_ms].export(tmp.name, format="wav")
            # export() hands back a file object; close it if so.
            if hasattr(exported, "close"):
                exported.close()
    except Exception:
        # Do not leave partial chunk files behind when splitting fails.
        for path in chunk_files:
            try:
                os.remove(path)
            except OSError:
                pass
        raise
    return chunk_files
1254
+
1255
def transcribe_video_bytes(video_bytes: bytes, whisper_model_name: str = "small") -> str:
    """
    Transcribe the audio track of a video with Whisper.

    The video bytes are written to a temporary .mp4, its audio is extracted
    to a temporary .wav, the audio is split with split_audio(), and each chunk
    is transcribed. All temporary files are removed before returning, also
    when an error occurs.

    Returns the chunk transcripts joined by single spaces.

    Raises:
        RuntimeError: if whisper/moviepy are unavailable, or the video has no
            audio track.
    """
    if not _HAS_WHISPER or not _HAS_MOVIEPY:
        raise RuntimeError("Whisper or moviepy not available on server.")

    temp_paths = []  # every temp file created here; cleaned up in `finally`
    try:
        # write video to temp file (handle closed by the with-block)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as vf:
            temp_paths.append(vf.name)
            vf.write(video_bytes)
        video_path = vf.name

        # Reserve a path for the extracted audio; close the handle right away.
        af = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        af.close()
        audio_path = af.name
        temp_paths.append(audio_path)

        clip = VideoFileClip(video_path)
        try:
            if clip.audio is None:
                raise RuntimeError("Uploaded video has no audio track to transcribe.")
            clip.audio.write_audiofile(audio_path, logger=None)
        finally:
            # Release the clip even when audio extraction fails.
            clip.close()

        chunk_files = split_audio(audio_path)
        temp_paths.extend(c for c in chunk_files if c != audio_path)

        model = whisper.load_model(whisper_model_name)
        pieces = []
        for chunk_path in chunk_files:
            text = model.transcribe(chunk_path).get("text", "").strip()
            if text:
                pieces.append(text)
        return " ".join(pieces)
    finally:
        # Best-effort cleanup: a file that cannot be removed must not mask the
        # result or the original error.
        for path in temp_paths:
            try:
                if os.path.exists(path):
                    os.remove(path)
            except OSError:
                pass
1291
+
1292
+ # generate MCQs from summary (reuse existing function if present)
1293
def generate_mcqs_from_summary_local(summary: str, num_qs: int = 10, model: str = MODEL):
    """
    Ask the Ollama model for ``num_qs`` MCQs covering ``summary`` and parse the reply.

    Returns the list produced by parse_mcqs_freeform(). If the model call
    reports an error (reply starts with "OLLAMA_ERROR"), a single placeholder
    entry carrying that error text as its question is returned instead.
    """
    # Same prompt approach as the Streamlit generate_mcqs_from_summary function.
    prompt = f"""
Generate {num_qs} distinct multiple-choice questions that cover the following summary.
For each question include:
- Exactly 4 labeled options A) B) C) D)
- A single-letter answer line like: Answer: <A/B/C/D>

Use exactly this format; do not add extra commentary or code fences.

Q1. <question text>
A) <option A>
B) <option B>
C) <option C>
D) <option D>
Answer: <A/B/C/D>

Summary:
{summary}
"""
    reply = call_ollama(prompt, model=model, timeout=600)
    if not reply.startswith("OLLAMA_ERROR"):
        return parse_mcqs_freeform(reply)
    return [{"question": reply, "options": [], "answer": ""}]
1317
+
1318
+ # Endpoint: transcribe -> summarize (video)
1319
@app.post("/transcribe_video")
async def transcribe_video(file: UploadFile = File(...), whisper_model: str = Form("small")):
    """
    Accepts a video file and returns transcript + summary.

    The local BART summarizer is tried first; when it yields nothing, the
    Ollama-based summarize_text() is used instead. Both paths return the same
    response shape: ``status``, ``transcript``, ``summary``, ``chunks`` and
    ``global_state`` (the video upload counter).
    """
    # Check dependencies before reading the (possibly large) upload.
    if not _HAS_WHISPER or not _HAS_MOVIEPY:
        return {"status": "error", "error": "Transcription requires whisper and moviepy installed on server."}

    video_bytes = await file.read()
    try:
        # Update the upload counter, skipping an immediate re-upload of the
        # same video. Counting is best-effort and must never block
        # transcription, so errors are ignored.
        try:
            md5 = hashlib.md5(video_bytes).hexdigest()
            if STATE.get("last_video_hash") != md5:
                STATE["video_uploads"] = STATE.get("video_uploads", 0) + 1
                STATE["last_video_hash"] = md5
        except Exception:
            pass

        # Transcribe (Whisper)
        transcript = transcribe_video_bytes(video_bytes, whisper_model_name=whisper_model)

        # Try local BART summarizer first; fall back to Ollama when it is empty.
        summ = summarize_transcript_with_bart(transcript)
        summary_text = summ["overall"]
        if not summary_text:
            summary_text = summarize_text(transcript, model=MODEL, max_words=200)

        return {
            "status": "success",
            "transcript": transcript,
            "summary": summary_text,
            "chunks": summ["chunks"],
            "global_state": {
                "video_uploads": STATE.get("video_uploads", 0),
            },
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}
1349
+
1350
+ # Endpoint: generate MCQs (from summary or from video file)
1351
@app.post("/generate_video_mcqs")
async def generate_video_mcqs(
    file: UploadFile = File(None),
    summary: str = Form(""),
    question_type: str = Form("both"),  # "mcq", "descriptive", "both"
    num_qs: int = Form(10),
    whisper_model: str = Form("small")
):
    """
    Generate MCQs (and optionally descriptive questions) from a provided summary string,
    or from an uploaded video file (which will be transcribed & summarized).

    A non-empty ``summary`` takes precedence over ``file``. The generated
    questions are also rendered to CSV, Excel and DOCX and stored under
    content-hash keys.

    Returns per-request counts, the download keys, the global counters, the
    generated questions, the summary used and any chunk summaries.
    """
    qtype = (question_type or "both").lower()
    summary_text = summary or ""
    try:
        # If file provided and summary empty, transcribe & summarize first
        if file is not None and not summary_text:
            if not _HAS_WHISPER or not _HAS_MOVIEPY:
                return {"status": "error", "error": "Transcription requires whisper and moviepy installed on server."}
            video_bytes = await file.read()
            transcript = transcribe_video_bytes(video_bytes, whisper_model_name=whisper_model)
            # try local BART, fall back to Ollama when it produced nothing
            summ = summarize_transcript_with_bart(transcript)
            chunk_summaries = summ["chunks"]
            summary_text = summ["overall"] or summarize_text(transcript, model=MODEL, max_words=200)
        elif summary_text:
            chunk_summaries = []
        else:
            return {"status": "error", "error": "No summary or file provided."}

        produce_mcq = qtype in ("mcq", "both")
        produce_desc = qtype in ("descriptive", "both")

        # We'll treat this as single topic "Video Summary"
        mcqs = (
            generate_mcqs_from_summary_local(summary_text, num_qs=num_qs, model=MODEL)
            if produce_mcq else []
        )
        descrs = (
            generate_descriptive_with_answers("Video summary", context=summary_text, model=MODEL, num_qs=3)
            if produce_desc else []
        )
        results = {"Video summary": {"mcqs": mcqs, "descriptive": descrs}}

        # Build files only containing the selected types
        df_all = build_dfs_from_questions(results)

        # CSV
        csv_bytes = df_all.to_csv(index=False).encode("utf-8")
        csv_key = hashlib.md5(csv_bytes).hexdigest()
        store_result_bytes(csv_key, csv_bytes, "video_questions.csv", "text/csv")

        # Excel
        excel_buf = BytesIO()
        with pd.ExcelWriter(excel_buf, engine="xlsxwriter") as writer:
            df_all.to_excel(writer, sheet_name="Questions", index=False)
        excel_bytes = excel_buf.getvalue()
        excel_key = hashlib.md5(excel_bytes).hexdigest()
        store_result_bytes(excel_key, excel_bytes, "video_questions.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")

        # DOCX
        docx_bytes = build_docx_bytes(results)
        docx_key = hashlib.md5(docx_bytes).hexdigest()
        store_result_bytes(docx_key, docx_bytes, "video_questions.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document")

        # counts for this request
        mcq_count_now = len(mcqs)
        desc_count_now = len(descrs)

        # update global state
        STATE["mcq_count"] = STATE.get("mcq_count", 0) + mcq_count_now
        STATE["desc_count"] = STATE.get("desc_count", 0) + desc_count_now

        return {
            "status": "success",
            "mcqCount": mcq_count_now,
            "descCount": desc_count_now,
            "download_keys": {"csv": csv_key, "excel": excel_key, "docx": docx_key},
            "global_state": {
                # .get() so a missing PDF counter cannot turn a finished
                # generation into an error response.
                # NOTE(review): 0 is an assumed default for both PDF counters — confirm.
                "pdf_uploads": STATE.get("pdf_uploads", 0),
                "last_pdf_pages": STATE.get("last_pdf_pages", 0),
                "mcq_count": STATE["mcq_count"],
                "desc_count": STATE["desc_count"]
            },
            "results": results,
            "summary": summary_text,
            "chunks": chunk_summaries
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}
1450
+
1451
+
1452
+
1453
+
1454
+
1455
+
1456
+
1457
+
1458
+
1459
+
1460
+
1461
+
1462
+
1463
+
1464
+
1465
+
1466
+
1467
+
1468
+
1469
+
1470
+
1471
+
1472
+
1473
+
1474
+
1475
+
1476
+
1477
+
1478
+
1479
+
1480
+
1481
+
1482
+
1483
+
1484
+
1485
+
1486
+
1487
+
1488
+
1489
+
1490
+
1491
+
1492
+
1493
+
1494
+
1495
+
1496
+
1497
+
1498
+
1499
+
1500
+
1501
+
1502
+
1503
+
1504
+
1505
+
1506
+
1507
+
1508
+
1509
+
1510
+
1511
+
1512
+
1513
+
1514
+
1515
+
1516
+
1517
+
1518
+
1519
+
1520
+
1521
+
1522
+
1523
+
1524
+
1525
+
1526
+
1527
+
1528
+
1529
+
1530
+
1531
+
1532
+
1533
+
1534
+
1535
+
1536
+