rairo committed on
Commit
06fc015
·
verified ·
1 Parent(s): f716c87

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +496 -662
main.py CHANGED
@@ -4,6 +4,7 @@ import logging
4
  import re
5
  import time
6
  import threading
 
7
  import numpy as np
8
  import fitz # PyMuPDF
9
  from flask import Flask, request, jsonify
@@ -14,58 +15,83 @@ from sklearn.metrics.pairwise import cosine_similarity
14
  import firebase_admin
15
  from firebase_admin import credentials, db as firebase_db
16
 
17
- # --- CONFIGURATION ---
 
 
18
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
19
  logger = logging.getLogger(__name__)
20
 
21
- SYLLABI_DIR = "syllabi"
22
  PAST_EXAMS_DIR = "past_exams"
23
 
24
- # Google GenAI Config
25
- GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
26
  EMBEDDING_MODEL = "models/text-embedding-004"
27
-
28
- # --- GLOBAL STATE (IN-MEMORY CACHE) ---
29
- # Structure: { "A_9706": { "meta": {...}, "tree": [...] }, ... }
30
- SYLLABUS_MAP = {}
31
-
32
- # Structure: [ { "vector": [...], "meta": {...} } ]
33
- VECTOR_DB = []
34
- VECTOR_MATRIX = None # Numpy array for fast math
35
-
36
- # Past exam index: { "A_9706": { "<sanitized paperId>": { "meta": {...}, "pages": [...], "questions": [...] }, ... }, ... }
37
- EXAM_MAP = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  app = Flask(__name__)
40
  CORS(app)
41
 
42
- # -----------------------------------------------------------------------------
43
- # 0. FIREBASE INITIALIZATION
44
- # -----------------------------------------------------------------------------
45
-
46
- firebase_db_ref = None
47
 
48
  def init_firebase():
49
- global firebase_db_ref
50
  try:
51
- credentials_json_string = os.environ.get("FIREBASE")
52
- if not credentials_json_string:
53
- logger.warning("FIREBASE env var not set. Firebase caching disabled.")
54
- return False
55
-
56
- credentials_json = json.loads(credentials_json_string)
57
- firebase_db_url = os.environ.get("Firebase_DB")
58
-
59
- if not firebase_db_url:
60
- logger.warning("Firebase_DB env var not set. Firebase caching disabled.")
61
  return False
62
-
63
  if not firebase_admin._apps:
64
- cred = credentials.Certificate(credentials_json)
65
- firebase_admin.initialize_app(cred, {"databaseURL": firebase_db_url})
66
-
67
- firebase_db_ref = firebase_db.reference()
68
- logger.info("Firebase initialized successfully in Data API.")
69
  return True
70
  except Exception as e:
71
  logger.error(f"Firebase init failed: {e}")
@@ -73,190 +99,239 @@ def init_firebase():
73
 
74
  FIREBASE_AVAILABLE = init_firebase()
75
 
76
- def fb_set(path: str, data):
77
- """Write to Firebase, silently fail if unavailable."""
78
- if not FIREBASE_AVAILABLE or firebase_db_ref is None:
79
- return
80
- try:
81
- firebase_db_ref.child(path).set(data)
82
- except Exception as e:
83
- logger.error(f"Firebase write failed [{path}]: {e}")
84
 
85
- def fb_get(path: str):
86
- """Read from Firebase, return None if unavailable."""
87
- if not FIREBASE_AVAILABLE or firebase_db_ref is None:
88
- return None
89
- try:
90
- return firebase_db_ref.child(path).get()
91
  except Exception as e:
92
- logger.error(f"Firebase read failed [{path}]: {e}")
93
  return None
94
 
95
- # -----------------------------------------------------------------------------
96
- # 1. BOILERPLATE PAGE DETECTION
97
- # -----------------------------------------------------------------------------
98
-
99
- # Keywords that identify non-content pages to skip
100
- BOILERPLATE_TITLE_PATTERNS = re.compile(
101
- r'^\s*(about\s+(this\s+)?syllabus|foreword|acknowledgements?|introduction\s+to\s+(cambridge|zimsec)|'
102
- r'how\s+to\s+use\s+this\s+syllabus|why\s+choose\s+cambridge|support\s+for\s+teachers|'
103
- r'teacher\s+support|resource\s+list|list\s+of\s+resources|further\s+information|'
104
- r'copyright|legal\s+notice|syllabus\s+overview\s+at\s+a\s+glance|'
105
- r'assessment\s+at\s+a\s+glance|grade\s+descriptions|mathematical\s+notation|'
106
- r'command\s+words|glossary\s+of\s+command|changes\s+to\s+this\s+syllabus|'
107
- r'other\s+cambridge|university\s+of\s+cambridge|cambridge\s+assessment|'
108
- r'published\s+by|contents\s*$|table\s+of\s+contents)\s*$',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  re.IGNORECASE
110
  )
111
 
112
- # Keywords that signal content has actually started
113
- CONTENT_START_PATTERNS = re.compile(
114
- r'^\s*((syllabus\s+)?content|subject\s+content|unit\s+\d|topic\s+\d|'
115
- r'section\s+\d|module\s+\d|\d+\s+[A-Z]|component\s+\d|paper\s+\d|'
116
- r'scheme\s+of\s+work|learning\s+objectives|knowledge.*understanding)',
 
 
117
  re.IGNORECASE
118
  )
119
 
120
- def is_boilerplate_block(text: str) -> bool:
121
- """Returns True if this block is boilerplate/admin content to skip."""
122
- return bool(BOILERPLATE_TITLE_PATTERNS.match(text.strip()))
 
123
 
124
- def page_is_boilerplate(page_text: str) -> bool:
125
- """Returns True if the entire page appears to be admin/front-matter."""
126
- lines = [l.strip() for l in page_text.splitlines() if l.strip()]
127
- if not lines:
128
- return True
129
- # Check first substantive line
130
- first = lines[0]
131
- if BOILERPLATE_TITLE_PATTERNS.match(first):
132
- return True
133
- # Check if page is very short (< 5 lines) with no numbered items — likely a divider
134
- if len(lines) < 5 and not re.search(r'\d+\.\d+|\d+\s+[A-Z]', page_text):
135
- # Could be a section divider page not boilerplate but also empty
136
- pass
137
- return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
- # -----------------------------------------------------------------------------
140
- # 2. THE PARSER ENGINE (Extracts Structure from PDF)
141
- # -----------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
  class PDFParser:
144
  def __init__(self, filepath):
145
- self.filepath = filepath
146
- self.filename = os.path.basename(filepath)
147
- self.doc = fitz.open(filepath)
148
-
149
- parts = filepath.replace("\\", "/").split("/")
150
- self.level = parts[-2] if len(parts) > 1 else "General"
151
- self.subject_code = re.search(r'\d{4}', self.filename)
152
- self.subject_code = self.subject_code.group(0) if self.subject_code else "0000"
153
- self.subject_name = re.sub(r'[_\-]\d{4}.*', '', self.filename.replace('_', ' ')).strip()
154
- self.unique_id = f"{self.level}_{self.subject_code}"
155
-
156
- def get_font_characteristics(self):
157
- """Scans PDF to find the most common font size (body text)."""
158
- font_sizes = {}
 
159
  for page in self.doc:
160
- blocks = page.get_text("dict")["blocks"]
161
- for b in blocks:
162
  for l in b.get("lines", []):
163
  for s in l.get("spans", []):
164
- size = round(s["size"], 1)
165
- font_sizes[size] = font_sizes.get(size, 0) + len(s["text"])
166
- if not font_sizes:
167
- return 10.0
168
- return max(font_sizes, key=font_sizes.get)
169
-
170
- def _find_content_start_page(self) -> int:
171
- """
172
- Scans pages to find where actual syllabus content begins.
173
- Returns the 0-based page index.
174
- """
175
- for page_num, page in enumerate(self.doc):
176
- text = page.get_text("text")
177
- # Skip empty pages
178
- if len(text.strip()) < 30:
179
- continue
180
- # Skip boilerplate pages
181
- if page_is_boilerplate(text):
182
- continue
183
- # Look for numbered content sections
184
- if CONTENT_START_PATTERNS.search(text):
185
- logger.info(f" Content starts at page {page_num + 1} for {self.filename}")
186
- return page_num
187
- # Also check if this page has numbered topic headers (e.g. "1 Number" or "1.1 ...")
188
- if re.search(r'\n\s*\d+\.?\d*\s+[A-Z][a-z]', text):
189
- logger.info(f" Content (numbered) starts at page {page_num + 1} for {self.filename}")
190
- return page_num
191
-
192
- # Fallback: skip first 10% of pages (usually all front-matter)
193
- fallback = max(1, len(self.doc) // 10)
194
- logger.info(f" Using fallback content start page {fallback + 1} for {self.filename}")
195
- return fallback
196
 
197
  def parse(self):
198
- body_size = self.get_font_characteristics()
199
- content_start = self._find_content_start_page()
200
- logger.info(f"Parsing {self.filename} (Body size ~{body_size}pt, content from page {content_start + 1})")
201
 
202
- syllabus_tree = []
203
- current_topic = None
204
- current_subtopic = None
205
 
206
- topic_pattern = re.compile(r'^(\d+\.?\s|Key Question\s)', re.IGNORECASE)
 
 
207
 
208
  for page_num, page in enumerate(self.doc):
209
- # Skip pre-content pages entirely
210
- if page_num < content_start:
211
  continue
212
 
213
- blocks = page.get_text("dict")["blocks"]
214
- for b in blocks:
215
  block_text = ""
216
- max_size = 0
217
- is_bold = False
218
 
219
  for l in b.get("lines", []):
220
  for s in l.get("spans", []):
221
- text = s["text"].strip()
222
- if not text:
223
- continue
224
- block_text += text + " "
225
- if s["size"] > max_size:
226
- max_size = s["size"]
227
- if "bold" in s["font"].lower():
228
- is_bold = True
229
 
230
  block_text = block_text.strip()
231
- if len(block_text) < 3:
232
- continue
233
 
234
- # Skip boilerplate blocks even within content pages
235
- if is_boilerplate_block(block_text):
 
236
  continue
237
 
238
- # HEURISTIC 1: TOPIC (Large Header — 2pt+ above body)
239
  if max_size > body_size + 2:
240
  if current_subtopic and current_topic:
241
  current_topic["children"].append(current_subtopic)
242
  current_subtopic = None
243
  if current_topic:
244
  syllabus_tree.append(current_topic)
245
-
246
  current_topic = {
247
- "id": f"{self.unique_id}_{len(syllabus_tree)}",
248
- "title": block_text,
249
- "type": "topic",
250
  "children": []
251
  }
252
  current_subtopic = None
253
 
254
- # HEURISTIC 2: SUBTOPIC (Bold, numbered, or keyword-led)
255
  elif (is_bold and max_size >= body_size) or \
256
  (topic_pattern.match(block_text) and max_size >= body_size):
257
  if current_subtopic and current_topic:
258
  current_topic["children"].append(current_subtopic)
259
-
260
  if not current_topic:
261
  current_topic = {
262
  "id": f"{self.unique_id}_root",
@@ -264,27 +339,25 @@ class PDFParser:
264
  "type": "topic",
265
  "children": []
266
  }
267
-
268
  current_subtopic = {
269
- "id": f"{current_topic['id']}_{len(current_topic['children'])}",
270
- "title": block_text,
271
- "type": "subtopic",
272
  "content": []
273
  }
274
 
275
- # HEURISTIC 3: CONTENT (Body Text)
276
  elif max_size <= body_size + 1:
277
  if current_subtopic:
278
  current_subtopic["content"].append(block_text)
279
  elif current_topic:
280
  current_subtopic = {
281
- "id": f"{current_topic['id']}_intro",
282
- "title": "Overview",
283
- "type": "subtopic",
284
  "content": [block_text]
285
  }
286
 
287
- # Flush remainders
288
  if current_subtopic and current_topic:
289
  current_topic["children"].append(current_subtopic)
290
  if current_topic:
@@ -292,646 +365,407 @@ class PDFParser:
292
 
293
  return {
294
  "meta": {
295
- "id": self.unique_id,
296
- "subject": self.subject_name,
297
- "code": self.subject_code,
298
- "level": self.level,
299
- "filename": self.filename,
300
  "indexed_at": int(time.time())
301
  },
302
  "tree": syllabus_tree
303
  }
304
 
305
 
306
- # -----------------------------------------------------------------------------
307
- # 3. PAST EXAM PAPER PARSER
308
- # -----------------------------------------------------------------------------
309
 
310
  class ExamPaperParser:
311
- """
312
- Extracts metadata and full text from past exam PDFs.
313
- Expected naming: syllabi_code_year_session_paper.pdf
314
- E.g.: 9702_2023_May_Paper1.pdf or 9702_2023_s1.pdf
315
- Falls back to filename parsing when possible.
316
- """
317
-
318
  def __init__(self, filepath):
319
- self.filepath = filepath
320
- self.filename = os.path.basename(filepath)
321
- self.doc = fitz.open(filepath)
322
-
323
- parts = filepath.replace("\\", "/").split("/")
324
- self.level = parts[-2] if len(parts) > 1 else "General"
325
-
326
- # Parse subject code from filename
327
- code_match = re.search(r'\b(\d{4})\b', self.filename)
328
- self.subject_code = code_match.group(1) if code_match else "0000"
329
- self.unique_id = f"{self.level}_{self.subject_code}"
330
-
331
- # Parse year
332
- year_match = re.search(r'\b(20\d{2}|19\d{2})\b', self.filename)
333
- self.year = year_match.group(1) if year_match else "Unknown"
334
-
335
- # Parse session (May/June, Oct/Nov, etc.)
336
- session_match = re.search(
337
- r'(may[_\-]?june|oct[_\-]?nov|feb[_\-]?mar|summer|winter|s\d|w\d|m\d)',
338
- self.filename, re.IGNORECASE
339
- )
340
- self.session = session_match.group(1).upper() if session_match else "Unknown"
341
-
342
- # Parse paper number
343
- paper_match = re.search(r'[_\-]p(\d)|paper[\s_\-]?(\d)', self.filename, re.IGNORECASE)
344
- if paper_match:
345
- self.paper_num = paper_match.group(1) or paper_match.group(2)
346
- else:
347
- self.paper_num = "1"
348
-
349
- self.paper_id = f"{self.unique_id}_{self.year}_{self.session}_P{self.paper_num}"
350
 
351
  def extract_pages(self):
352
- """Extract text per page."""
353
- pages = []
354
- for i, page in enumerate(self.doc):
355
- text = page.get_text("text").strip()
356
- if text:
357
- pages.append({
358
- "page": i + 1,
359
- "text": text[:3000] # cap per page to avoid huge payloads
360
- })
361
- return pages
362
 
363
  def extract_questions(self):
364
- """
365
- Heuristic: questions usually start with a number followed by a period/bracket.
366
- E.g. "1." or "1 " or "(a)" at start of paragraph.
367
- Returns list of { number, text }.
368
- """
369
- questions = []
370
- full_text = "\n".join(p["text"] for p in self.extract_pages())
371
-
372
- # Split by question numbers
373
- q_pattern = re.compile(
374
- r'(?:^|\n)\s*(\d{1,2})\s*[\.\)]\s+(.+?)(?=\n\s*\d{1,2}\s*[\.\)]|\Z)',
375
- re.DOTALL | re.MULTILINE
376
- )
377
- for m in q_pattern.finditer(full_text):
378
- q_num = int(m.group(1))
379
- q_text = m.group(2).strip()
380
- if len(q_text) > 20: # filter noise
381
- questions.append({"number": q_num, "text": q_text[:2000]})
382
-
383
- return questions
384
 
385
  def parse(self):
386
- pages = self.extract_pages()
387
- questions = self.extract_questions()
388
-
389
  return {
390
  "meta": {
391
- "paperId": self.paper_id,
392
- "subjectId": self.unique_id,
393
  "subjectCode": self.subject_code,
394
- "level": self.level,
395
- "year": self.year,
396
- "session": self.session,
397
  "paperNumber": self.paper_num,
398
- "filename": self.filename,
399
- "totalPages": len(self.doc),
400
- "indexed_at": int(time.time())
401
  },
402
- "pages": pages,
403
- "questions": questions
404
  }
405
 
406
 
407
- # -----------------------------------------------------------------------------
408
- # 4. THE VECTOR ENGINE (Embeddings & Search)
409
- # -----------------------------------------------------------------------------
410
 
411
  def generate_embeddings(texts):
412
- """Generates embeddings using Gemini API."""
413
- if not GEMINI_API_KEY:
414
- logger.warning("No Gemini API Key. Using dummy vectors.")
415
  return [np.zeros(768).tolist() for _ in texts]
416
-
417
- client_g = genai.Client(api_key=GEMINI_API_KEY)
418
  results = []
419
- batch_size = 10
420
-
421
- for i in range(0, len(texts), batch_size):
422
- batch = texts[i:i + batch_size]
423
  try:
424
- resp = client_g.models.embed_content(
425
- model=EMBEDDING_MODEL,
426
- contents=batch,
427
- )
428
- for embedding in resp.embeddings:
429
- results.append(embedding.values)
430
  except Exception as e:
431
- logger.error(f"Embedding batch {i} failed: {e}")
432
  for _ in batch:
433
  results.append(np.zeros(768).tolist())
434
-
435
  return results
436
 
437
 
438
- # -----------------------------------------------------------------------------
439
- # 5. FIREBASE-BACKED INDEX BUILDER
440
- # -----------------------------------------------------------------------------
441
 
442
  def load_index_from_firebase():
443
- """
444
- Tries to load the full index from Firebase.
445
- Returns True if successfully loaded.
446
- """
447
  global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX, EXAM_MAP
448
-
449
- if not FIREBASE_AVAILABLE:
450
- return False
451
-
452
- logger.info("Attempting to load index from Firebase...")
453
-
454
  try:
455
- # Load syllabus map
456
  fb_syllabi = fb_get("data_api/syllabi")
457
- if not fb_syllabi:
458
- logger.info("No syllabus data in Firebase yet.")
459
- return False
460
-
461
  SYLLABUS_MAP = fb_syllabi
462
 
463
- # Load vector DB
464
  fb_vectors = fb_get("data_api/vectors")
465
- if not fb_vectors:
466
- logger.info("No vector data in Firebase yet.")
467
- return False
468
-
469
- VECTOR_DB = []
470
- valid_vectors = []
471
-
472
- for entry in fb_vectors.values() if isinstance(fb_vectors, dict) else fb_vectors:
473
- if not entry:
474
- continue
475
  vec = np.array(entry["vector"])
476
- VECTOR_DB.append({
477
- "vector": vec,
478
- "meta": entry["meta"]
479
- })
480
- valid_vectors.append(vec)
481
-
482
- if valid_vectors:
483
- VECTOR_MATRIX = np.vstack(valid_vectors)
484
 
485
- # Load exam map
486
  fb_exams = fb_get("data_api/exams")
487
  if fb_exams:
488
  EXAM_MAP = fb_exams
489
 
490
- logger.info(
491
- f"Loaded from Firebase: {len(SYLLABUS_MAP)} syllabi, "
492
- f"{len(VECTOR_DB)} vectors, {len(EXAM_MAP)} exam subjects."
493
- )
494
  return True
495
-
496
  except Exception as e:
497
- logger.error(f"Failed to load from Firebase: {e}")
498
  return False
499
 
 
 
500
 
501
- def save_syllabus_to_firebase(subject_id: str, data: dict):
502
- """Save a single syllabus entry to Firebase."""
503
- # Store tree without numpy arrays (just plain dicts)
504
- fb_set(f"data_api/syllabi/{subject_id}", data)
505
-
506
-
507
- def save_vectors_to_firebase(vector_entries: list):
508
- """Save vector entries to Firebase (store as lists, not numpy)."""
509
  fb_data = {}
510
- for i, entry in enumerate(vector_entries):
511
- key = f"v_{i:06d}"
512
- fb_data[key] = {
513
  "vector": entry["vector"].tolist() if isinstance(entry["vector"], np.ndarray) else entry["vector"],
514
- "meta": entry["meta"]
515
  }
516
  fb_set("data_api/vectors", fb_data)
517
 
 
 
 
518
 
519
- def save_exam_to_firebase(subject_id: str, paper_data: dict):
520
- """Save a parsed exam paper under the subject's exam list."""
521
- paper_id = paper_data["meta"]["paperId"]
522
- # Sanitize key
523
- safe_key = re.sub(r'[.\[\]#$/]', '_', paper_id)
524
- fb_set(f"data_api/exams/{subject_id}/{safe_key}", paper_data)
525
 
 
 
 
526
 
527
  def build_index():
528
- """
529
- Walks directories, parses PDFs, builds JSON tree and Vector Index,
530
- then persists everything to Firebase.
531
- """
532
  global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX, EXAM_MAP
533
-
534
- logger.info("🚀 Starting Build Process...")
535
-
536
- # ---- SYLLABI ----
537
  parsed_data = []
538
 
539
  if os.path.exists(SYLLABI_DIR):
540
- for root, dirs, files in os.walk(SYLLABI_DIR):
541
- for file in sorted(files):
542
- if file.endswith(".pdf"):
543
- path = os.path.join(root, file)
544
- logger.info(f"Parsing syllabus: {path}")
545
- try:
546
- parser = PDFParser(path)
547
- data = parser.parse()
548
- parsed_data.append(data)
549
- SYLLABUS_MAP[data["meta"]["id"]] = data
550
- save_syllabus_to_firebase(data["meta"]["id"], data)
551
- except Exception as e:
552
- logger.error(f"Failed to parse {path}: {e}")
553
- else:
554
- logger.warning(f"Directory {SYLLABI_DIR} not found.")
555
 
556
- # ---- PAST EXAMS ----
557
  if os.path.exists(PAST_EXAMS_DIR):
558
- for root, dirs, files in os.walk(PAST_EXAMS_DIR):
559
- for file in sorted(files):
560
- if file.endswith(".pdf"):
561
- path = os.path.join(root, file)
562
- logger.info(f"Parsing exam paper: {path}")
563
- try:
564
- parser = ExamPaperParser(path)
565
- exam_data = parser.parse()
566
- subject_id = exam_data["meta"]["subjectId"]
567
-
568
- if subject_id not in EXAM_MAP:
569
- EXAM_MAP[subject_id] = {}
 
 
 
570
 
571
- paper_id = exam_data["meta"]["paperId"]
572
- safe_key = re.sub(r'[.\[\]#$/]', '_', paper_id)
573
- EXAM_MAP[subject_id][safe_key] = exam_data
574
- save_exam_to_firebase(subject_id, exam_data)
575
- except Exception as e:
576
- logger.error(f"Failed to parse exam {path}: {e}")
577
- else:
578
- logger.info(f"No past_exams directory found at {PAST_EXAMS_DIR}. Skipping.")
579
-
580
- # ---- VECTORIZATION (syllabi only) ----
581
  if not parsed_data:
582
- logger.info("No new syllabus data to vectorize.")
583
  return
584
 
585
- chunks_to_embed = []
586
- chunk_metadata = []
587
-
588
  for item in parsed_data:
589
- meta_base = item["meta"]
590
  for topic in item["tree"]:
591
  for sub in topic.get("children", []):
592
- text_blob = "\n".join(sub.get("content", []))
593
- if len(text_blob) < 10:
594
- continue
595
-
596
- rich_text = (
597
- f"{meta_base['subject']} {meta_base['level']} "
598
- f"- {topic['title']} - {sub['title']}:\n{text_blob}"
599
- )
600
- chunks_to_embed.append(rich_text)
601
- chunk_metadata.append({
602
- "subject_id": meta_base["id"],
603
- "topic_id": topic["id"],
604
  "subtopic_id": sub["id"],
605
- "title": sub["title"],
606
- "content": text_blob
607
  })
608
 
609
- logger.info(f"🧮 Generating embeddings for {len(chunks_to_embed)} chunks...")
610
- vectors = generate_embeddings(chunks_to_embed)
611
-
612
  VECTOR_DB = []
613
- valid_vectors = []
614
-
615
- for i, vec in enumerate(vectors):
616
- np_vec = np.array(vec)
617
- VECTOR_DB.append({
618
- "vector": np_vec,
619
- "meta": chunk_metadata[i]
620
- })
621
- valid_vectors.append(np_vec)
622
-
623
- if valid_vectors:
624
- VECTOR_MATRIX = np.vstack(valid_vectors)
625
-
626
- # Persist to Firebase
627
- save_vectors_to_firebase(VECTOR_DB)
628
-
629
- logger.info(
630
- f"✅ Indexing Complete. "
631
- f"{len(SYLLABUS_MAP)} syllabi, {len(VECTOR_DB)} vectors, "
632
- f"{sum(len(v) for v in EXAM_MAP.values())} exam papers."
633
- )
634
-
 
 
 
 
 
 
 
 
 
 
 
635
 
636
- # -----------------------------------------------------------------------------
637
- # 6. DIRECTORY WATCHER — Auto-index new PDFs
638
- # -----------------------------------------------------------------------------
639
 
 
 
 
640
  _indexed_files = set()
641
 
642
- def _collect_existing_files():
643
- """Collect all currently-present PDFs to avoid re-indexing on boot."""
644
  for d in [SYLLABI_DIR, PAST_EXAMS_DIR]:
645
- if not os.path.exists(d):
646
- continue
647
  for root, _, files in os.walk(d):
648
  for f in files:
649
  if f.endswith(".pdf"):
650
  _indexed_files.add(os.path.join(root, f))
651
 
652
-
653
- def _watch_directories(interval=30):
654
- """Background thread: detect new PDFs and index them."""
655
  while True:
656
  time.sleep(interval)
657
  for directory, is_exam in [(SYLLABI_DIR, False), (PAST_EXAMS_DIR, True)]:
658
- if not os.path.exists(directory):
659
- continue
660
  for root, _, files in os.walk(directory):
661
- for file in files:
662
- if not file.endswith(".pdf"):
663
- continue
664
- path = os.path.join(root, file)
665
- if path in _indexed_files:
666
- continue
667
-
668
- logger.info(f"🆕 New PDF detected: {path}")
669
  _indexed_files.add(path)
670
-
671
  try:
672
  if is_exam:
673
- parser = ExamPaperParser(path)
674
  exam_data = parser.parse()
675
- subject_id = exam_data["meta"]["subjectId"]
676
-
677
- if subject_id not in EXAM_MAP:
678
- EXAM_MAP[subject_id] = {}
679
- paper_id = exam_data["meta"]["paperId"]
680
- safe_key = re.sub(r'[.\[\]#$/]', '_', paper_id)
681
- EXAM_MAP[subject_id][safe_key] = exam_data
682
- save_exam_to_firebase(subject_id, exam_data)
683
  else:
684
  parser = PDFParser(path)
685
- data = parser.parse()
686
  SYLLABUS_MAP[data["meta"]["id"]] = data
687
- save_syllabus_to_firebase(data["meta"]["id"], data)
688
- # Re-vectorize just this document
689
  _incremental_vectorize(data)
690
-
691
  except Exception as e:
692
- logger.error(f"Error indexing new file {path}: {e}")
693
-
694
-
695
- def _incremental_vectorize(syllabus_data: dict):
696
- """Add vectors for a single newly-uploaded syllabus."""
697
- global VECTOR_DB, VECTOR_MATRIX
698
-
699
- meta_base = syllabus_data["meta"]
700
- chunks = []
701
- metas = []
702
-
703
- for topic in syllabus_data["tree"]:
704
- for sub in topic.get("children", []):
705
- text_blob = "\n".join(sub.get("content", []))
706
- if len(text_blob) < 10:
707
- continue
708
- rich_text = (
709
- f"{meta_base['subject']} {meta_base['level']} "
710
- f"- {topic['title']} - {sub['title']}:\n{text_blob}"
711
- )
712
- chunks.append(rich_text)
713
- metas.append({
714
- "subject_id": meta_base["id"],
715
- "topic_id": topic["id"],
716
- "subtopic_id": sub["id"],
717
- "title": sub["title"],
718
- "content": text_blob
719
- })
720
 
721
- if not chunks:
722
- return
723
-
724
- vectors = generate_embeddings(chunks)
725
 
726
- for i, vec in enumerate(vectors):
727
- np_vec = np.array(vec)
728
- VECTOR_DB.append({"vector": np_vec, "meta": metas[i]})
729
-
730
- if VECTOR_DB:
731
- VECTOR_MATRIX = np.vstack([e["vector"] for e in VECTOR_DB])
732
-
733
- # Persist full updated vector set
734
- save_vectors_to_firebase(VECTOR_DB)
735
- logger.info(f"Incremental vectorize complete for {meta_base['id']}.")
736
-
737
-
738
- # -----------------------------------------------------------------------------
739
- # 7. API ENDPOINTS
740
- # -----------------------------------------------------------------------------
741
 
742
  @app.route('/health', methods=['GET'])
743
  def health():
744
  return jsonify({
745
- "status": "online",
746
  "subjects_loaded": list(SYLLABUS_MAP.keys()),
747
- "vector_chunks": len(VECTOR_DB),
748
- "exam_subjects": list(EXAM_MAP.keys()),
749
- "firebase": FIREBASE_AVAILABLE
 
 
750
  })
751
 
 
 
 
 
 
 
 
 
 
 
 
752
 
753
  @app.route('/v1/structure/<subject_id>', methods=['GET'])
754
  def get_structure(subject_id):
755
- """Returns the static JSON tree for navigation UI."""
756
  data = SYLLABUS_MAP.get(subject_id)
757
  if not data:
758
  return jsonify({"error": "Subject not found"}), 404
759
  return jsonify(data)
760
 
761
-
762
- @app.route('/v1/subjects', methods=['GET'])
763
- def list_subjects():
764
- """Returns metadata for all indexed syllabi."""
765
- result = []
766
- for sid, data in SYLLABUS_MAP.items():
767
- result.append(data.get("meta", {"id": sid}))
768
- return jsonify(result)
769
-
770
-
771
  @app.route('/v1/search', methods=['POST'])
772
  def search():
773
- """
774
- Semantic Retrieval.
775
- Input: { "query": "...", "filter_subject_id": "..." (optional) }
776
- """
777
- if VECTOR_MATRIX is None or len(VECTOR_DB) == 0:
778
  return jsonify({"error": "Index not ready"}), 503
779
-
780
- data = request.json or {}
781
- query = data.get("query")
782
- subject_filter = data.get("filter_subject_id")
783
-
784
- if not query:
785
  return jsonify({"error": "Query required"}), 400
786
-
787
- if not GEMINI_API_KEY:
788
  return jsonify({"error": "Embedding API not configured"}), 503
789
-
790
- client_g = genai.Client(api_key=GEMINI_API_KEY)
791
  try:
792
- resp = client_g.models.embed_content(model=EMBEDDING_MODEL, contents=query)
793
- query_vec = np.array(resp.embeddings[0].values).reshape(1, -1)
794
  except Exception as e:
795
  return jsonify({"error": str(e)}), 500
796
-
797
- scores = cosine_similarity(query_vec, VECTOR_MATRIX)[0]
798
- top_indices = np.argsort(scores)[::-1]
799
-
800
  results = []
801
- count = 0
802
- for idx in top_indices:
803
- if scores[idx] < 0.3:
804
- break
805
- entry = VECTOR_DB[idx]
806
- meta = entry["meta"]
807
-
808
- if subject_filter and meta["subject_id"] != subject_filter:
809
- continue
810
-
811
- results.append({
812
- "score": float(scores[idx]),
813
- "subject_id": meta["subject_id"],
814
- "title": meta["title"],
815
- "content": meta["content"],
816
- "node_id": meta["subtopic_id"]
817
- })
818
-
819
- count += 1
820
- if count >= 5:
821
- break
822
-
823
  return jsonify({"results": results})
824
 
825
-
826
  @app.route('/v1/exams', methods=['GET'])
827
  def list_exams():
828
- """
829
- List past exam papers.
830
- Query param: subject_id (optional)
831
- """
832
- subject_id = request.args.get("subject_id")
833
-
834
- if subject_id:
835
- papers = EXAM_MAP.get(subject_id, {})
836
- result = [p["meta"] for p in papers.values() if isinstance(p, dict) and "meta" in p]
837
- else:
838
- result = []
839
- for sid, papers in EXAM_MAP.items():
840
- for p in papers.values():
841
- if isinstance(p, dict) and "meta" in p:
842
- result.append(p["meta"])
843
-
844
- return jsonify(result)
845
-
846
 
847
  @app.route('/v1/exams/<paper_id>', methods=['GET'])
848
  def get_exam(paper_id):
849
- """
850
- Get full exam paper (pages + questions).
851
- paper_id format: A_9702_2023_MAY_P1
852
- """
853
- safe_key = re.sub(r'[.\[\]#$/]', '_', paper_id)
854
-
855
- for sid, papers in EXAM_MAP.items():
856
  for key, paper in papers.items():
857
- if key == safe_key or (isinstance(paper, dict) and
858
- paper.get("meta", {}).get("paperId") == paper_id):
859
  return jsonify(paper)
860
-
861
- return jsonify({"error": "Exam paper not found"}), 404
862
-
863
 
864
  @app.route('/v1/exams/<paper_id>/questions', methods=['GET'])
865
  def get_exam_questions(paper_id):
866
- """Get just the extracted questions from a past paper."""
867
- safe_key = re.sub(r'[.\[\]#$/]', '_', paper_id)
868
-
869
- for sid, papers in EXAM_MAP.items():
870
  for key, paper in papers.items():
871
- if key == safe_key or (isinstance(paper, dict) and
872
- paper.get("meta", {}).get("paperId") == paper_id):
873
- return jsonify({
874
- "paperId": paper_id,
875
- "meta": paper.get("meta"),
876
- "questions": paper.get("questions", [])
877
- })
878
-
879
- return jsonify({"error": "Exam paper not found"}), 404
880
-
881
 
882
  @app.route('/v1/rebuild', methods=['POST'])
883
  def trigger_rebuild():
884
- """
885
- Trigger a full index rebuild (admin use).
886
- Optionally pass { "force": true } to bypass Firebase cache.
887
- """
888
- auth_header = request.headers.get("Authorization", "")
889
- rebuild_key = os.environ.get("REBUILD_SECRET", "")
890
- if rebuild_key and auth_header != f"Bearer {rebuild_key}":
891
  return jsonify({"error": "Unauthorized"}), 401
892
-
893
- def _rebuild_bg():
894
  global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX, EXAM_MAP
895
- SYLLABUS_MAP = {}
896
- VECTOR_DB = []
897
- VECTOR_MATRIX = None
898
- EXAM_MAP = {}
899
  build_index()
900
-
901
- t = threading.Thread(target=_rebuild_bg, daemon=True)
902
- t.start()
903
  return jsonify({"status": "rebuild started"}), 202
904
 
905
 
906
- # -----------------------------------------------------------------------------
907
- # 8. STARTUP BOOTSTRAP
908
- # -----------------------------------------------------------------------------
909
 
910
  def start_app():
911
- # Create directories if needed
912
  for d in [SYLLABI_DIR, PAST_EXAMS_DIR]:
913
  if not os.path.exists(d):
914
  os.makedirs(os.path.join(d, "A"), exist_ok=True)
915
  os.makedirs(os.path.join(d, "O"), exist_ok=True)
916
- logger.info(f"Created empty directory: {d}")
917
-
918
- # Try to load from Firebase first
919
- loaded = load_index_from_firebase()
920
-
921
- if not loaded:
922
- # Build from scratch
923
  build_index()
924
  else:
925
- logger.info("Served from Firebase cache. Skipping full rebuild.")
926
-
927
- # Collect existing files so the watcher doesn't re-index them
928
- _collect_existing_files()
929
-
930
- # Start background watcher for new uploads
931
- watcher = threading.Thread(target=_watch_directories, daemon=True)
932
- watcher.start()
933
- logger.info("Directory watcher started.")
934
-
935
 
936
  with app.app_context():
937
  start_app()
 
4
  import re
5
  import time
6
  import threading
7
+ import base64
8
  import numpy as np
9
  import fitz # PyMuPDF
10
  from flask import Flask, request, jsonify
 
15
  import firebase_admin
16
  from firebase_admin import credentials, db as firebase_db
17
 
18
+ # ---------------------------------------------------------------------------
19
+ # CONFIGURATION
20
+ # ---------------------------------------------------------------------------
21
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
22
  logger = logging.getLogger(__name__)
23
 
24
+ SYLLABI_DIR = "syllabi"
25
  PAST_EXAMS_DIR = "past_exams"
26
 
27
+ GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
 
28
  EMBEDDING_MODEL = "models/text-embedding-004"
29
+ VISION_MODEL = "gemini-2.5-flash"
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # COMPLETE SUBJECT REGISTRY (all 24 PDFs on HuggingFace)
33
+ # ---------------------------------------------------------------------------
34
+ A_LEVEL_SUBJECTS = {
35
+ "A_9706": "Accounting",
36
+ "A_9700": "Biology",
37
+ "A_9609": "Business",
38
+ "A_9701": "Chemistry",
39
+ "A_9618": "Computer Science",
40
+ "A_9708": "Economics",
41
+ "A_9231": "Further Mathematics",
42
+ "A_9489": "History",
43
+ "A_9695": "Literature in English",
44
+ "A_9709": "Mathematics",
45
+ "A_9702": "Physics",
46
+ "A_9699": "Sociology",
47
+ "A_9395": "Travel and Tourism",
48
+ }
49
+ O_LEVEL_SUBJECTS = {
50
+ "O_0452": "Accounting",
51
+ "O_0610": "Biology",
52
+ "O_0450": "Business Studies",
53
+ "O_0620": "Chemistry",
54
+ "O_0478": "Computer Science",
55
+ "O_0500": "English Language",
56
+ "O_0475": "English Literature",
57
+ "O_0680": "Environmental Management",
58
+ "O_0460": "Geography",
59
+ "O_0470": "History",
60
+ "O_0625": "Physics",
61
+ }
62
+ ALL_SUBJECTS = {**A_LEVEL_SUBJECTS, **O_LEVEL_SUBJECTS}
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # GLOBAL STATE
66
+ # ---------------------------------------------------------------------------
67
+ SYLLABUS_MAP = {}
68
+ VECTOR_DB = []
69
+ VECTOR_MATRIX = None
70
+ EXAM_MAP = {}
71
 
72
  app = Flask(__name__)
73
  CORS(app)
74
 
75
+ # ---------------------------------------------------------------------------
76
+ # FIREBASE
77
+ # ---------------------------------------------------------------------------
78
+ firebase_db_ref = None
79
+ FIREBASE_AVAILABLE = False
80
 
81
  def init_firebase():
82
+ global firebase_db_ref, FIREBASE_AVAILABLE
83
  try:
84
+ creds_str = os.environ.get("FIREBASE")
85
+ db_url = os.environ.get("Firebase_DB")
86
+ if not creds_str or not db_url:
87
+ logger.warning("Firebase env vars missing.")
 
 
 
 
 
 
88
  return False
 
89
  if not firebase_admin._apps:
90
+ cred = credentials.Certificate(json.loads(creds_str))
91
+ firebase_admin.initialize_app(cred, {"databaseURL": db_url})
92
+ firebase_db_ref = firebase_db.reference()
93
+ FIREBASE_AVAILABLE = True
94
+ logger.info("Firebase initialised (Data API).")
95
  return True
96
  except Exception as e:
97
  logger.error(f"Firebase init failed: {e}")
 
99
 
100
  FIREBASE_AVAILABLE = init_firebase()
101
 
102
def fb_set(path, data):
    """Write ``data`` at ``path`` below the Firebase root reference.

    Does nothing when Firebase is unavailable. Write failures are logged and
    swallowed, so persistence is best-effort.
    """
    if FIREBASE_AVAILABLE:
        try:
            node = firebase_db_ref.child(path)
            node.set(data)
        except Exception as exc:
            logger.error(f"FB write [{path}]: {exc}")
 
 
 
 
106
 
107
def fb_get(path):
    """Read and return the value stored at ``path`` below the Firebase root.

    Returns ``None`` when Firebase is unavailable or when the read raises
    (the error is logged).
    """
    if FIREBASE_AVAILABLE:
        try:
            node = firebase_db_ref.child(path)
            return node.get()
        except Exception as exc:
            logger.error(f"FB read [{path}]: {exc}")
    return None
113
 
114
+ # ---------------------------------------------------------------------------
115
+ # GEMINI CLIENT
116
+ # ---------------------------------------------------------------------------
117
+ _gemini_client = None
118
+
119
def get_gemini():
    """Return the shared Gemini client, creating it on first use.

    Returns ``None`` when no ``GEMINI_API_KEY`` is configured.
    """
    global _gemini_client
    # Reuse an existing client; without a key there is nothing to create.
    if _gemini_client is not None or not GEMINI_API_KEY:
        return _gemini_client
    _gemini_client = genai.Client(api_key=GEMINI_API_KEY)
    return _gemini_client
124
+
125
+ # ---------------------------------------------------------------------------
126
+ # VISION-BASED PAGE CLASSIFIER
127
+ # Renders each page as an image and asks Gemini to classify it.
128
+ # Falls back to heuristic if vision call fails or key is absent.
129
+ # ---------------------------------------------------------------------------
130
+
131
+ DEFINITE_BOILERPLATE_RE = re.compile(
132
+ r'^\s*(about\s+this\s+syllabus|foreword|acknowledgements?|'
133
+ r'why\s+choose\s+(cambridge|zimsec|this\s+syllabus)|cambridge\s+learner|'
134
+ r'key\s+benefits?|how\s+to\s+use\s+this\s+syllabus|'
135
+ r'support\s+for\s+(cambridge|teachers)|resource\s+list|'
136
+ r'further\s+information|copyright|legal\s+notice|'
137
+ r'changes\s+to\s+this\s+syllabus|university\s+of\s+cambridge|'
138
+ r'cambridge\s+assessment\s+international|published\s+by|'
139
+ r'contents?\s*$|table\s+of\s+contents?|'
140
+ r'assessment\s+at\s+a\s+glance|syllabus\s+at\s+a\s+glance|'
141
+ r'grade\s+descriptions?|command\s+words|glossary\s+of\s+command|'
142
+ r'mathematical\s+notation|other\s+cambridge\s+qualifications|'
143
+ r'how\s+to\s+offer|progression|post[-\s]?qualification|'
144
+ r'school\s+supported\s+candidate|cambridge\s+primary|cambridge\s+lower\s+secondary)\s*$',
145
  re.IGNORECASE
146
  )
147
 
148
+ CONTENT_START_RE = re.compile(
149
+ r'(^|\n)\s*(\d+\.?\d*\s+[A-Z][a-z]|\d+\s+[A-Z][a-z]|'
150
+ r'subject\s+content|'
151
+ r'unit\s+\d|topic\s+\d|section\s+\d|module\s+\d|'
152
+ r'component\s+\d|paper\s+\d|'
153
+ r'learning\s+objectives|knowledge\s+and\s+understanding|'
154
+ r'candidates\s+should\s+be\s+able)',
155
  re.IGNORECASE
156
  )
157
 
158
def _page_to_base64_png(page, dpi=72) -> str:
    """Render ``page`` to an RGB PNG and return it base64-encoded as text.

    ``dpi`` is converted to a zoom factor relative to 72.
    """
    zoom = dpi / 72
    transform = fitz.Matrix(zoom, zoom)
    pixmap = page.get_pixmap(matrix=transform, colorspace=fitz.csRGB)
    png_bytes = pixmap.tobytes("png")
    return base64.b64encode(png_bytes).decode("utf-8")
162
 
163
def _vision_classify_page(page, page_num: int, subject_name: str) -> str:
    """Ask the vision model whether a rendered page is boilerplate or content.

    Returns ``'boilerplate'``, ``'content'`` or ``'uncertain'``. ``'uncertain'``
    covers three cases: no Gemini client, an unrecognised reply, or any
    exception during the call (which is logged as a warning).
    """
    gemini = get_gemini()
    if gemini is None:
        return "uncertain"
    try:
        encoded_page = _page_to_base64_png(page)
        instructions = (
            f"This is page {page_num + 1} of a Cambridge International AS & A Level / "
            f"IGCSE syllabus for {subject_name}.\n\n"
            "Classify this page as ONE of:\n"
            "BOILERPLATE - administrative or introductory content: foreword, about this "
            "syllabus, why choose Cambridge, key benefits, Cambridge learner attributes, "
            "how to use this syllabus, table of contents, copyright, assessment overview "
            "tables, grade descriptions, command words, mathematical notation appendix, "
            "support information, changes to syllabus, qualification overview.\n"
            "CONTENT - actual subject matter students must learn: topic lists, learning "
            "objectives, numbered content sections, subject-specific knowledge points, "
            "skills, practical work descriptions, candidate assessment criteria.\n\n"
            "Reply with exactly one word: BOILERPLATE or CONTENT"
        )
        message = {"role": "user", "parts": [
            {"inline_data": {"mime_type": "image/png", "data": encoded_page}},
            {"text": instructions}
        ]}
        reply = gemini.models.generate_content(model=VISION_MODEL, contents=[message])
        verdict_text = (reply.text or "").strip().upper()
        # BOILERPLATE is tested first, matching the precedence of the reply check.
        for label in ("BOILERPLATE", "CONTENT"):
            if label in verdict_text:
                return label.lower()
        return "uncertain"
    except Exception as exc:
        logger.warning(f"Vision classify page {page_num}: {exc}")
        return "uncertain"
198
+
199
def classify_all_pages(doc, subject_name: str, vision_page_limit: int = 40) -> list:
    """Classify every page of ``doc`` as ``'boilerplate'`` or ``'content'``.

    Pages at index ``vision_page_limit`` and beyond are assumed to be content
    and are not inspected. Earlier pages are classified, in order, by:

    1. a hard rule: the first non-empty line matches ``DEFINITE_BOILERPLATE_RE``;
    2. an emptiness rule: fewer than 30 characters of text is boilerplate;
    3. the vision model, falling back to ``CONTENT_START_RE`` when the model
       is uncertain.

    If no page at all ends up as content, the whole document is re-classified
    heuristically: everything from the first page matching
    ``CONTENT_START_RE`` onwards is content.

    Results are not cached; every call re-classifies the document.

    Args:
        doc: Sized iterable of pages exposing ``get_text("text")``.
        subject_name: Subject name passed to the vision prompt and log lines.
        vision_page_limit: Number of leading pages to inspect (default 40).

    Returns:
        One label per page, in page order.
    """
    classifications = []
    n = len(doc)

    for i, page in enumerate(doc):
        # Pages beyond the front-matter zone are almost always content, so
        # skip text extraction and the vision call for them entirely.
        if i >= vision_page_limit:
            classifications.append("content")
            continue

        text = page.get_text("text").strip()

        # Hard-rule catch on the first non-empty line.
        first_line = next((ln.strip() for ln in text.splitlines() if ln.strip()), None)
        if first_line and DEFINITE_BOILERPLATE_RE.match(first_line):
            classifications.append("boilerplate")
            continue

        # (Nearly) empty page.
        if len(text) < 30:
            classifications.append("boilerplate")
            continue

        # Vision call, with a regex tie-break when the model cannot decide.
        verdict = _vision_classify_page(page, i, subject_name)
        if verdict == "uncertain":
            verdict = "content" if CONTENT_START_RE.search(text) else "boilerplate"
        classifications.append(verdict)
        logger.info(f" Page {i+1}/{n}: {verdict}")

    # Safety net: if nothing was classified as content, fall back to the
    # heuristic for the whole document.
    if "content" not in classifications:
        logger.warning(f" All pages BOILERPLATE for {subject_name} — applying heuristic fallback.")
        classifications = []
        found_content = False
        for page in doc:
            if not found_content and CONTENT_START_RE.search(page.get_text("text")):
                found_content = True
            classifications.append("content" if found_content else "boilerplate")

    return classifications
246
+
247
+
248
+ # ---------------------------------------------------------------------------
249
+ # PDF PARSER — Vision-enhanced
250
+ # ---------------------------------------------------------------------------
251
 
252
  class PDFParser:
253
  def __init__(self, filepath):
254
+ self.filepath = filepath
255
+ self.filename = os.path.basename(filepath)
256
+ self.doc = fitz.open(filepath)
257
+ parts = filepath.replace("\\", "/").split("/")
258
+ self.level = parts[-2] if len(parts) > 1 else "General"
259
+ code_m = re.search(r'\d{4}', self.filename)
260
+ self.subject_code = code_m.group(0) if code_m else "0000"
261
+ self.unique_id = f"{self.level}_{self.subject_code}"
262
+ self.subject_name = ALL_SUBJECTS.get(
263
+ self.unique_id,
264
+ re.sub(r'[_\-]\d{4}.*', '', self.filename.replace('_', ' ')).strip()
265
+ )
266
+
267
+ def get_body_font_size(self):
268
+ sizes = {}
269
  for page in self.doc:
270
+ for b in page.get_text("dict")["blocks"]:
 
271
  for l in b.get("lines", []):
272
  for s in l.get("spans", []):
273
+ sz = round(s["size"], 1)
274
+ sizes[sz] = sizes.get(sz, 0) + len(s["text"])
275
+ return max(sizes, key=sizes.get) if sizes else 10.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
 
277
  def parse(self):
278
+ body_size = self.get_body_font_size()
279
+ page_classes = classify_all_pages(self.doc, self.subject_name)
280
+ topic_pattern = re.compile(r'^(\d+\.?\s|Key\s+Question\s)', re.IGNORECASE)
281
 
282
+ logger.info(f"Parsing content of {self.filename} (body ~{body_size}pt)")
283
+ content_page_count = sum(1 for c in page_classes if c == "content")
284
+ logger.info(f" {content_page_count} content pages out of {len(self.doc)} total")
285
 
286
+ syllabus_tree = []
287
+ current_topic = None
288
+ current_subtopic = None
289
 
290
  for page_num, page in enumerate(self.doc):
291
+ if page_classes[page_num] == "boilerplate":
 
292
  continue
293
 
294
+ for b in page.get_text("dict")["blocks"]:
 
295
  block_text = ""
296
+ max_size = 0
297
+ is_bold = False
298
 
299
  for l in b.get("lines", []):
300
  for s in l.get("spans", []):
301
+ t = s["text"].strip()
302
+ if not t: continue
303
+ block_text += t + " "
304
+ if s["size"] > max_size: max_size = s["size"]
305
+ if "bold" in s["font"].lower(): is_bold = True
 
 
 
306
 
307
  block_text = block_text.strip()
308
+ if len(block_text) < 3: continue
 
309
 
310
+ # Skip residual boilerplate blocks within content pages
311
+ first_words = " ".join(block_text.split()[:6])
312
+ if DEFINITE_BOILERPLATE_RE.match(first_words):
313
  continue
314
 
315
+ # TOPIC
316
  if max_size > body_size + 2:
317
  if current_subtopic and current_topic:
318
  current_topic["children"].append(current_subtopic)
319
  current_subtopic = None
320
  if current_topic:
321
  syllabus_tree.append(current_topic)
 
322
  current_topic = {
323
+ "id": f"{self.unique_id}_{len(syllabus_tree)}",
324
+ "title": block_text,
325
+ "type": "topic",
326
  "children": []
327
  }
328
  current_subtopic = None
329
 
330
+ # SUBTOPIC
331
  elif (is_bold and max_size >= body_size) or \
332
  (topic_pattern.match(block_text) and max_size >= body_size):
333
  if current_subtopic and current_topic:
334
  current_topic["children"].append(current_subtopic)
 
335
  if not current_topic:
336
  current_topic = {
337
  "id": f"{self.unique_id}_root",
 
339
  "type": "topic",
340
  "children": []
341
  }
 
342
  current_subtopic = {
343
+ "id": f"{current_topic['id']}_{len(current_topic['children'])}",
344
+ "title": block_text,
345
+ "type": "subtopic",
346
  "content": []
347
  }
348
 
349
+ # BODY
350
  elif max_size <= body_size + 1:
351
  if current_subtopic:
352
  current_subtopic["content"].append(block_text)
353
  elif current_topic:
354
  current_subtopic = {
355
+ "id": f"{current_topic['id']}_intro",
356
+ "title": "Overview",
357
+ "type": "subtopic",
358
  "content": [block_text]
359
  }
360
 
 
361
  if current_subtopic and current_topic:
362
  current_topic["children"].append(current_subtopic)
363
  if current_topic:
 
365
 
366
  return {
367
  "meta": {
368
+ "id": self.unique_id,
369
+ "subject": self.subject_name,
370
+ "code": self.subject_code,
371
+ "level": self.level,
372
+ "filename": self.filename,
373
  "indexed_at": int(time.time())
374
  },
375
  "tree": syllabus_tree
376
  }
377
 
378
 
379
+ # ---------------------------------------------------------------------------
380
+ # PAST EXAM PARSER
381
+ # ---------------------------------------------------------------------------
382
 
383
  class ExamPaperParser:
 
 
 
 
 
 
 
384
  def __init__(self, filepath):
385
+ self.filepath = filepath
386
+ self.filename = os.path.basename(filepath)
387
+ self.doc = fitz.open(filepath)
388
+ parts = filepath.replace("\\", "/").split("/")
389
+ self.level = parts[-2] if len(parts) > 1 else "General"
390
+ code_m = re.search(r'\b(\d{4})\b', self.filename)
391
+ self.subject_code = code_m.group(1) if code_m else "0000"
392
+ self.unique_id = f"{self.level}_{self.subject_code}"
393
+ year_m = re.search(r'\b(20\d{2}|19\d{2})\b', self.filename)
394
+ self.year = year_m.group(1) if year_m else "Unknown"
395
+ sess_m = re.search(r'(may[_\-]?june|oct[_\-]?nov|feb[_\-]?mar|summer|winter|s\d|w\d|m\d)', self.filename, re.IGNORECASE)
396
+ self.session = sess_m.group(1).upper() if sess_m else "Unknown"
397
+ paper_m = re.search(r'[_\-]p(\d)|paper[\s_\-]?(\d)', self.filename, re.IGNORECASE)
398
+ self.paper_num = (paper_m.group(1) or paper_m.group(2)) if paper_m else "1"
399
+ self.paper_id = f"{self.unique_id}_{self.year}_{self.session}_P{self.paper_num}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400
 
401
  def extract_pages(self):
402
+ return [{"page": i + 1, "text": p.get_text("text").strip()[:3000]}
403
+ for i, p in enumerate(self.doc) if p.get_text("text").strip()]
 
 
 
 
 
 
 
 
404
 
405
  def extract_questions(self):
406
+ full = "\n".join(p["text"] for p in self.extract_pages())
407
+ pat = re.compile(r'(?:^|\n)\s*(\d{1,2})\s*[\.\)]\s+(.+?)(?=\n\s*\d{1,2}\s*[\.\)]|\Z)', re.DOTALL | re.MULTILINE)
408
+ return [{"number": int(m.group(1)), "text": m.group(2).strip()[:2000]}
409
+ for m in pat.finditer(full) if len(m.group(2).strip()) > 20]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410
 
411
  def parse(self):
 
 
 
412
  return {
413
  "meta": {
414
+ "paperId": self.paper_id,
415
+ "subjectId": self.unique_id,
416
  "subjectCode": self.subject_code,
417
+ "level": self.level,
418
+ "year": self.year,
419
+ "session": self.session,
420
  "paperNumber": self.paper_num,
421
+ "filename": self.filename,
422
+ "totalPages": len(self.doc),
423
+ "indexed_at": int(time.time())
424
  },
425
+ "pages": self.extract_pages(),
426
+ "questions": self.extract_questions()
427
  }
428
 
429
 
430
+ # ---------------------------------------------------------------------------
431
+ # EMBEDDINGS
432
+ # ---------------------------------------------------------------------------
433
 
434
  def generate_embeddings(texts):
435
+ client = get_gemini()
436
+ if client is None:
 
437
  return [np.zeros(768).tolist() for _ in texts]
 
 
438
  results = []
439
+ for i in range(0, len(texts), 10):
440
+ batch = texts[i:i + 10]
 
 
441
  try:
442
+ resp = client.models.embed_content(model=EMBEDDING_MODEL, contents=batch)
443
+ for emb in resp.embeddings:
444
+ results.append(emb.values)
 
 
 
445
  except Exception as e:
446
+ logger.error(f"Embed batch {i}: {e}")
447
  for _ in batch:
448
  results.append(np.zeros(768).tolist())
 
449
  return results
450
 
451
 
452
+ # ---------------------------------------------------------------------------
453
+ # FIREBASE PERSISTENCE
454
+ # ---------------------------------------------------------------------------
455
 
456
  def load_index_from_firebase():
 
 
 
 
457
  global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX, EXAM_MAP
458
+ if not FIREBASE_AVAILABLE: return False
459
+ logger.info("Loading index from Firebase ...")
 
 
 
 
460
  try:
 
461
  fb_syllabi = fb_get("data_api/syllabi")
462
+ if not fb_syllabi: return False
 
 
 
463
  SYLLABUS_MAP = fb_syllabi
464
 
 
465
  fb_vectors = fb_get("data_api/vectors")
466
+ if not fb_vectors: return False
467
+ VECTOR_DB = []
468
+ valid = []
469
+ for entry in (fb_vectors.values() if isinstance(fb_vectors, dict) else fb_vectors):
470
+ if not entry: continue
 
 
 
 
 
471
  vec = np.array(entry["vector"])
472
+ VECTOR_DB.append({"vector": vec, "meta": entry["meta"]})
473
+ valid.append(vec)
474
+ if valid:
475
+ VECTOR_MATRIX = np.vstack(valid)
 
 
 
 
476
 
 
477
  fb_exams = fb_get("data_api/exams")
478
  if fb_exams:
479
  EXAM_MAP = fb_exams
480
 
481
+ logger.info(f"Loaded: {len(SYLLABUS_MAP)} syllabi, {len(VECTOR_DB)} vectors, {len(EXAM_MAP)} exam subjects.")
 
 
 
482
  return True
 
483
  except Exception as e:
484
+ logger.error(f"Firebase load: {e}")
485
  return False
486
 
487
def save_syllabus(sid, data):
    """Persist one parsed syllabus under ``data_api/syllabi/<sid>``."""
    target = f"data_api/syllabi/{sid}"
    fb_set(target, data)
489
 
490
def save_all_vectors():
    """Write the whole ``VECTOR_DB`` to ``data_api/vectors``.

    Entries are keyed ``v_000000``, ``v_000001``, ... in list order, and
    NumPy vectors are converted to plain lists before the write.
    """
    def _as_list(vec):
        return vec.tolist() if isinstance(vec, np.ndarray) else vec

    payload = {
        f"v_{idx:06d}": {"vector": _as_list(item["vector"]), "meta": item["meta"]}
        for idx, item in enumerate(VECTOR_DB)
    }
    fb_set("data_api/vectors", payload)
498
 
499
def save_exam(sid, exam_data):
    """Persist one parsed exam paper under ``data_api/exams/<sid>/<key>``.

    The key is the paper id with the characters ``. [ ] # $ /`` replaced by
    underscores.
    """
    paper_id = exam_data["meta"]["paperId"]
    key = re.sub(r'[.\[\]#$/]', '_', paper_id)
    fb_set(f"data_api/exams/{sid}/{key}", exam_data)
502
 
 
 
 
 
 
 
503
 
504
+ # ---------------------------------------------------------------------------
505
+ # INDEX BUILDER
506
+ # ---------------------------------------------------------------------------
507
 
508
  def build_index():
 
 
 
 
509
  global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX, EXAM_MAP
510
+ logger.info("Full index build starting ...")
 
 
 
511
  parsed_data = []
512
 
513
  if os.path.exists(SYLLABI_DIR):
514
+ for root, _, files in os.walk(SYLLABI_DIR):
515
+ for f in sorted(files):
516
+ if not f.endswith(".pdf"): continue
517
+ path = os.path.join(root, f)
518
+ logger.info(f"Syllabus: {path}")
519
+ try:
520
+ parser = PDFParser(path)
521
+ data = parser.parse()
522
+ parsed_data.append(data)
523
+ SYLLABUS_MAP[data["meta"]["id"]] = data
524
+ save_syllabus(data["meta"]["id"], data)
525
+ except Exception as e:
526
+ logger.error(f"{path}: {e}")
 
 
527
 
 
528
  if os.path.exists(PAST_EXAMS_DIR):
529
+ for root, _, files in os.walk(PAST_EXAMS_DIR):
530
+ for f in sorted(files):
531
+ if not f.endswith(".pdf"): continue
532
+ path = os.path.join(root, f)
533
+ logger.info(f"Exam: {path}")
534
+ try:
535
+ parser = ExamPaperParser(path)
536
+ exam_data = parser.parse()
537
+ sid = exam_data["meta"]["subjectId"]
538
+ if sid not in EXAM_MAP: EXAM_MAP[sid] = {}
539
+ safe = re.sub(r'[.\[\]#$/]', '_', exam_data["meta"]["paperId"])
540
+ EXAM_MAP[sid][safe] = exam_data
541
+ save_exam(sid, exam_data)
542
+ except Exception as e:
543
+ logger.error(f"{path}: {e}")
544
 
 
 
 
 
 
 
 
 
 
 
545
  if not parsed_data:
546
+ logger.info("Nothing to vectorize.")
547
  return
548
 
549
+ chunks, metas = [], []
 
 
550
  for item in parsed_data:
551
+ mb = item["meta"]
552
  for topic in item["tree"]:
553
  for sub in topic.get("children", []):
554
+ blob = "\n".join(sub.get("content", []))
555
+ if len(blob) < 10: continue
556
+ chunks.append(f"{mb['subject']} {mb['level']} - {topic['title']} - {sub['title']}:\n{blob}")
557
+ metas.append({
558
+ "subject_id": mb["id"],
559
+ "topic_id": topic["id"],
 
 
 
 
 
 
560
  "subtopic_id": sub["id"],
561
+ "title": sub["title"],
562
+ "content": blob
563
  })
564
 
565
+ logger.info(f"Embedding {len(chunks)} chunks ...")
566
+ vecs = generate_embeddings(chunks)
 
567
  VECTOR_DB = []
568
+ valid = []
569
+ for i, v in enumerate(vecs):
570
+ nv = np.array(v)
571
+ VECTOR_DB.append({"vector": nv, "meta": metas[i]})
572
+ valid.append(nv)
573
+ if valid:
574
+ VECTOR_MATRIX = np.vstack(valid)
575
+ save_all_vectors()
576
+ logger.info(f"Index done: {len(SYLLABUS_MAP)} syllabi, {len(VECTOR_DB)} vectors.")
577
+
578
+
579
def _incremental_vectorize(syllabus_data):
    """Embed one parsed syllabus and merge it into the vector index.

    Vectors already stored for the same subject id are dropped first, so
    re-indexing a subject replaces its entries instead of duplicating them.
    The merged index is then saved. Nothing is written when there is
    neither anything to add nor anything to remove.

    Args:
        syllabus_data: Dict with "meta" and "tree" keys, as returned by
            ``PDFParser.parse``.
    """
    global VECTOR_DB, VECTOR_MATRIX
    mb = syllabus_data["meta"]
    chunks, metas = [], []
    for topic in syllabus_data["tree"]:
        for sub in topic.get("children", []):
            blob = "\n".join(sub.get("content", []))
            if len(blob) < 10:
                continue
            chunks.append(f"{mb['subject']} {mb['level']} - {topic['title']} - {sub['title']}:\n{blob}")
            metas.append({
                "subject_id": mb["id"],
                "topic_id": topic["id"],
                "subtopic_id": sub["id"],
                "title": sub["title"],
                "content": blob
            })

    # Entries from an earlier version of this subject are stale.
    kept = [e for e in VECTOR_DB if e["meta"].get("subject_id") != mb["id"]]
    if not chunks and len(kept) == len(VECTOR_DB):
        return

    fresh = []
    if chunks:
        fresh = [{"vector": np.array(v), "meta": metas[i]}
                 for i, v in enumerate(generate_embeddings(chunks))]

    VECTOR_DB = kept + fresh
    VECTOR_MATRIX = np.vstack([e["vector"] for e in VECTOR_DB]) if VECTOR_DB else None
    save_all_vectors()
601
 
 
 
 
602
 
603
+ # ---------------------------------------------------------------------------
604
+ # WATCHER
605
+ # ---------------------------------------------------------------------------
606
  _indexed_files = set()
607
 
608
def _collect_existing():
    """Record every PDF already on disk in ``_indexed_files``.

    Both the syllabus and the past-exam directories are walked; a directory
    that does not exist is skipped.
    """
    for base_dir in (SYLLABI_DIR, PAST_EXAMS_DIR):
        if os.path.exists(base_dir):
            for folder, _, names in os.walk(base_dir):
                for name in names:
                    if name.endswith(".pdf"):
                        _indexed_files.add(os.path.join(folder, name))
615
 
616
def _watch(interval=30):
    """Poll the data directories forever and index PDFs that appear.

    Every ``interval`` seconds both directories are walked. A PDF whose path
    is not yet in ``_indexed_files`` is recorded there first and then parsed:
    exam papers go into ``EXAM_MAP``; syllabi go into ``SYLLABUS_MAP`` and are
    vectorised incrementally. Both are persisted. A file that fails is logged
    and, being already recorded, not retried.
    """
    while True:
        time.sleep(interval)
        for directory, is_exam in [(SYLLABI_DIR, False), (PAST_EXAMS_DIR, True)]:
            if not os.path.exists(directory):
                continue
            for folder, _, names in os.walk(directory):
                for name in names:
                    if not name.endswith(".pdf"):
                        continue
                    path = os.path.join(folder, name)
                    if path in _indexed_files:
                        continue
                    _indexed_files.add(path)
                    logger.info(f"New PDF: {path}")
                    try:
                        if not is_exam:
                            parser = PDFParser(path)
                            data = parser.parse()
                            subject_id = data["meta"]["id"]
                            SYLLABUS_MAP[subject_id] = data
                            save_syllabus(data["meta"]["id"], data)
                            _incremental_vectorize(data)
                        else:
                            parser = ExamPaperParser(path)
                            exam_data = parser.parse()
                            sid = exam_data["meta"]["subjectId"]
                            key = re.sub(r'[.\[\]#$/]', '_', exam_data["meta"]["paperId"])
                            EXAM_MAP.setdefault(sid, {})[key] = exam_data
                            save_exam(sid, exam_data)
                    except Exception as exc:
                        logger.error(f"Watch {path}: {exc}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
645
 
 
 
 
 
646
 
647
+ # ---------------------------------------------------------------------------
648
+ # API
649
+ # ---------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
650
 
651
  @app.route('/health', methods=['GET'])
652
  def health():
653
  return jsonify({
654
+ "status": "online",
655
  "subjects_loaded": list(SYLLABUS_MAP.keys()),
656
+ "subject_count": len(SYLLABUS_MAP),
657
+ "vector_chunks": len(VECTOR_DB),
658
+ "exam_subjects": list(EXAM_MAP.keys()),
659
+ "firebase": FIREBASE_AVAILABLE,
660
+ "registered_subjects": ALL_SUBJECTS
661
  })
662
 
663
@app.route('/v1/subjects', methods=['GET'])
def list_subjects():
    """List indexed subjects, then registered subjects not yet indexed.

    Indexed entries carry their stored metadata plus ``"indexed": True``;
    the others are built from the registry with ``"indexed": False``.
    """
    indexed = [
        {**entry.get("meta", {"id": key}), "indexed": True}
        for key, entry in SYLLABUS_MAP.items()
    ]
    pending = []
    for uid, name in ALL_SUBJECTS.items():
        if uid in SYLLABUS_MAP:
            continue
        pending.append({
            "id": uid,
            "subject": name,
            "code": uid.split("_")[1],
            "level": "A" if uid.startswith("A_") else "O",
            "indexed": False,
        })
    return jsonify(indexed + pending)
674
 
675
  @app.route('/v1/structure/<subject_id>', methods=['GET'])
676
  def get_structure(subject_id):
 
677
  data = SYLLABUS_MAP.get(subject_id)
678
  if not data:
679
  return jsonify({"error": "Subject not found"}), 404
680
  return jsonify(data)
681
 
 
 
 
 
 
 
 
 
 
 
682
  @app.route('/v1/search', methods=['POST'])
683
  def search():
684
+ if VECTOR_MATRIX is None or not VECTOR_DB:
 
 
 
 
685
  return jsonify({"error": "Index not ready"}), 503
686
+ req = request.json or {}
687
+ q = req.get("query")
688
+ sf = req.get("filter_subject_id")
689
+ if not q:
 
 
690
  return jsonify({"error": "Query required"}), 400
691
+ c = get_gemini()
692
+ if c is None:
693
  return jsonify({"error": "Embedding API not configured"}), 503
 
 
694
  try:
695
+ resp = c.models.embed_content(model=EMBEDDING_MODEL, contents=q)
696
+ qv = np.array(resp.embeddings[0].values).reshape(1, -1)
697
  except Exception as e:
698
  return jsonify({"error": str(e)}), 500
699
+ scores = cosine_similarity(qv, VECTOR_MATRIX)[0]
 
 
 
700
  results = []
701
+ for idx in np.argsort(scores)[::-1]:
702
+ if scores[idx] < 0.3: break
703
+ meta = VECTOR_DB[idx]["meta"]
704
+ if sf and meta["subject_id"] != sf: continue
705
+ results.append({"score": float(scores[idx]), "subject_id": meta["subject_id"],
706
+ "title": meta["title"], "content": meta["content"],
707
+ "node_id": meta["subtopic_id"]})
708
+ if len(results) >= 5: break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
709
  return jsonify({"results": results})
710
 
 
711
  @app.route('/v1/exams', methods=['GET'])
712
  def list_exams():
713
+ sid = request.args.get("subject_id")
714
+ out = []
715
+ for s, papers in EXAM_MAP.items():
716
+ if sid and s != sid: continue
717
+ for p in papers.values():
718
+ if isinstance(p, dict) and "meta" in p:
719
+ out.append(p["meta"])
720
+ return jsonify(out)
 
 
 
 
 
 
 
 
 
 
721
 
722
  @app.route('/v1/exams/<paper_id>', methods=['GET'])
723
  def get_exam(paper_id):
724
+ safe = re.sub(r'[.\[\]#$/]', '_', paper_id)
725
+ for _, papers in EXAM_MAP.items():
 
 
 
 
 
726
  for key, paper in papers.items():
727
+ if key == safe or (isinstance(paper, dict) and paper.get("meta", {}).get("paperId") == paper_id):
 
728
  return jsonify(paper)
729
+ return jsonify({"error": "Not found"}), 404
 
 
730
 
731
  @app.route('/v1/exams/<paper_id>/questions', methods=['GET'])
732
  def get_exam_questions(paper_id):
733
+ safe = re.sub(r'[.\[\]#$/]', '_', paper_id)
734
+ for _, papers in EXAM_MAP.items():
 
 
735
  for key, paper in papers.items():
736
+ if key == safe or (isinstance(paper, dict) and paper.get("meta", {}).get("paperId") == paper_id):
737
+ return jsonify({"paperId": paper_id, "meta": paper.get("meta"), "questions": paper.get("questions", [])})
738
+ return jsonify({"error": "Not found"}), 404
 
 
 
 
 
 
 
739
 
740
  @app.route('/v1/rebuild', methods=['POST'])
741
  def trigger_rebuild():
742
+ secret = os.environ.get("REBUILD_SECRET", "")
743
+ if secret and request.headers.get("Authorization", "") != f"Bearer {secret}":
 
 
 
 
 
744
  return jsonify({"error": "Unauthorized"}), 401
745
+ def _bg():
 
746
  global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX, EXAM_MAP
747
+ SYLLABUS_MAP = {}; VECTOR_DB = []; VECTOR_MATRIX = None; EXAM_MAP = {}
 
 
 
748
  build_index()
749
+ threading.Thread(target=_bg, daemon=True).start()
 
 
750
  return jsonify({"status": "rebuild started"}), 202
751
 
752
 
753
+ # ---------------------------------------------------------------------------
754
+ # STARTUP
755
+ # ---------------------------------------------------------------------------
756
 
757
  def start_app():
 
758
  for d in [SYLLABI_DIR, PAST_EXAMS_DIR]:
759
  if not os.path.exists(d):
760
  os.makedirs(os.path.join(d, "A"), exist_ok=True)
761
  os.makedirs(os.path.join(d, "O"), exist_ok=True)
762
+ if not load_index_from_firebase():
 
 
 
 
 
 
763
  build_index()
764
  else:
765
+ logger.info("Served from Firebase cache.")
766
+ _collect_existing()
767
+ threading.Thread(target=_watch, daemon=True).start()
768
+ logger.info("Watcher started.")
 
 
 
 
 
 
769
 
770
  with app.app_context():
771
  start_app()