Spaces:

rairo
/

marka-data-api

Running

App Files Files Community

rairo commited on Apr 20

Commit

06fc015

verified ·

1 Parent(s): f716c87

Update main.py

Browse files

Files changed (1) hide show

main.py +496 -662

main.py CHANGED Viewed

@@ -4,6 +4,7 @@ import logging
 import re
 import time
 import threading
 import numpy as np
 import fitz  # PyMuPDF
 from flask import Flask, request, jsonify
@@ -14,58 +15,83 @@ from sklearn.metrics.pairwise import cosine_similarity
 import firebase_admin
 from firebase_admin import credentials, db as firebase_db
-# --- CONFIGURATION ---
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
-SYLLABI_DIR = "syllabi"
 PAST_EXAMS_DIR = "past_exams"
-# Google GenAI Config
-GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
 EMBEDDING_MODEL = "models/text-embedding-004"
-# --- GLOBAL STATE (IN-MEMORY CACHE) ---
-# Structure: { "A_9706": { "meta": {...}, "tree": [...] }, ... }
-SYLLABUS_MAP = {}
-# Structure: [ { "vector": [...], "meta": {...} } ]
-VECTOR_DB = []
-VECTOR_MATRIX = None  # Numpy array for fast math
-# Past exam index: { "A_9706": [ { paperId, year, session, fileUrl, pages: [...] }, ... ] }
-EXAM_MAP = {}
 app = Flask(__name__)
 CORS(app)
-# -----------------------------------------------------------------------------
-# 0. FIREBASE INITIALIZATION
-# -----------------------------------------------------------------------------
-firebase_db_ref = None
 def init_firebase():
-    global firebase_db_ref
     try:
-        credentials_json_string = os.environ.get("FIREBASE")
-        if not credentials_json_string:
-            logger.warning("FIREBASE env var not set. Firebase caching disabled.")
-            return False
-        credentials_json = json.loads(credentials_json_string)
-        firebase_db_url = os.environ.get("Firebase_DB")
-        if not firebase_db_url:
-            logger.warning("Firebase_DB env var not set. Firebase caching disabled.")
             return False
         if not firebase_admin._apps:
-            cred = credentials.Certificate(credentials_json)
-            firebase_admin.initialize_app(cred, {"databaseURL": firebase_db_url})
-        firebase_db_ref = firebase_db.reference()
-        logger.info("Firebase initialized successfully in Data API.")
         return True
     except Exception as e:
         logger.error(f"Firebase init failed: {e}")
@@ -73,190 +99,239 @@ def init_firebase():
 FIREBASE_AVAILABLE = init_firebase()
-def fb_set(path: str, data):
-    """Write to Firebase, silently fail if unavailable."""
-    if not FIREBASE_AVAILABLE or firebase_db_ref is None:
-        return
-    try:
-        firebase_db_ref.child(path).set(data)
-    except Exception as e:
-        logger.error(f"Firebase write failed [{path}]: {e}")
-def fb_get(path: str):
-    """Read from Firebase, return None if unavailable."""
-    if not FIREBASE_AVAILABLE or firebase_db_ref is None:
-        return None
-    try:
-        return firebase_db_ref.child(path).get()
     except Exception as e:
-        logger.error(f"Firebase read failed [{path}]: {e}")
         return None
-# -----------------------------------------------------------------------------
-# 1. BOILERPLATE PAGE DETECTION
-# -----------------------------------------------------------------------------
-# Keywords that identify non-content pages to skip
-BOILERPLATE_TITLE_PATTERNS = re.compile(
-    r'^\s*(about\s+(this\s+)?syllabus|foreword|acknowledgements?|introduction\s+to\s+(cambridge|zimsec)|'
-    r'how\s+to\s+use\s+this\s+syllabus|why\s+choose\s+cambridge|support\s+for\s+teachers|'
-    r'teacher\s+support|resource\s+list|list\s+of\s+resources|further\s+information|'
-    r'copyright|legal\s+notice|syllabus\s+overview\s+at\s+a\s+glance|'
-    r'assessment\s+at\s+a\s+glance|grade\s+descriptions|mathematical\s+notation|'
-    r'command\s+words|glossary\s+of\s+command|changes\s+to\s+this\s+syllabus|'
-    r'other\s+cambridge|university\s+of\s+cambridge|cambridge\s+assessment|'
-    r'published\s+by|contents\s*$|table\s+of\s+contents)\s*$',
     re.IGNORECASE
 )
-# Keywords that signal content has actually started
-CONTENT_START_PATTERNS = re.compile(
-    r'^\s*((syllabus\s+)?content|subject\s+content|unit\s+\d|topic\s+\d|'
-    r'section\s+\d|module\s+\d|\d+\s+[A-Z]|component\s+\d|paper\s+\d|'
-    r'scheme\s+of\s+work|learning\s+objectives|knowledge.*understanding)',
     re.IGNORECASE
 )
-def is_boilerplate_block(text: str) -> bool:
-    """Returns True if this block is boilerplate/admin content to skip."""
-    return bool(BOILERPLATE_TITLE_PATTERNS.match(text.strip()))
-def page_is_boilerplate(page_text: str) -> bool:
-    """Returns True if the entire page appears to be admin/front-matter."""
-    lines = [l.strip() for l in page_text.splitlines() if l.strip()]
-    if not lines:
-        return True
-    # Check first substantive line
-    first = lines[0]
-    if BOILERPLATE_TITLE_PATTERNS.match(first):
-        return True
-    # Check if page is very short (< 5 lines) with no numbered items — likely a divider
-    if len(lines) < 5 and not re.search(r'\d+\.\d+|\d+\s+[A-Z]', page_text):
-        # Could be a section divider page — not boilerplate but also empty
-        pass
-    return False
-# -----------------------------------------------------------------------------
-# 2. THE PARSER ENGINE (Extracts Structure from PDF)
-# -----------------------------------------------------------------------------
 class PDFParser:
     def __init__(self, filepath):
-        self.filepath = filepath
-        self.filename = os.path.basename(filepath)
-        self.doc = fitz.open(filepath)
-        parts = filepath.replace("\\", "/").split("/")
-        self.level = parts[-2] if len(parts) > 1 else "General"
-        self.subject_code = re.search(r'\d{4}', self.filename)
-        self.subject_code = self.subject_code.group(0) if self.subject_code else "0000"
-        self.subject_name = re.sub(r'[_\-]\d{4}.*', '', self.filename.replace('_', ' ')).strip()
-        self.unique_id = f"{self.level}_{self.subject_code}"
-    def get_font_characteristics(self):
-        """Scans PDF to find the most common font size (body text)."""
-        font_sizes = {}
         for page in self.doc:
-            blocks = page.get_text("dict")["blocks"]
-            for b in blocks:
                 for l in b.get("lines", []):
                     for s in l.get("spans", []):
-                        size = round(s["size"], 1)
-                        font_sizes[size] = font_sizes.get(size, 0) + len(s["text"])
-        if not font_sizes:
-            return 10.0
-        return max(font_sizes, key=font_sizes.get)
-    def _find_content_start_page(self) -> int:
-        """
-        Scans pages to find where actual syllabus content begins.
-        Returns the 0-based page index.
-        """
-        for page_num, page in enumerate(self.doc):
-            text = page.get_text("text")
-            # Skip empty pages
-            if len(text.strip()) < 30:
-                continue
-            # Skip boilerplate pages
-            if page_is_boilerplate(text):
-                continue
-            # Look for numbered content sections
-            if CONTENT_START_PATTERNS.search(text):
-                logger.info(f"  Content starts at page {page_num + 1} for {self.filename}")
-                return page_num
-            # Also check if this page has numbered topic headers (e.g. "1 Number" or "1.1 ...")
-            if re.search(r'\n\s*\d+\.?\d*\s+[A-Z][a-z]', text):
-                logger.info(f"  Content (numbered) starts at page {page_num + 1} for {self.filename}")
-                return page_num
-        # Fallback: skip first 10% of pages (usually all front-matter)
-        fallback = max(1, len(self.doc) // 10)
-        logger.info(f"  Using fallback content start page {fallback + 1} for {self.filename}")
-        return fallback
     def parse(self):
-        body_size = self.get_font_characteristics()
-        content_start = self._find_content_start_page()
-        logger.info(f"Parsing {self.filename} (Body size ~{body_size}pt, content from page {content_start + 1})")
-        syllabus_tree = []
-        current_topic = None
-        current_subtopic = None
-        topic_pattern = re.compile(r'^(\d+\.?\s|Key Question\s)', re.IGNORECASE)
         for page_num, page in enumerate(self.doc):
-            # Skip pre-content pages entirely
-            if page_num < content_start:
                 continue
-            blocks = page.get_text("dict")["blocks"]
-            for b in blocks:
                 block_text = ""
-                max_size = 0
-                is_bold = False
                 for l in b.get("lines", []):
                     for s in l.get("spans", []):
-                        text = s["text"].strip()
-                        if not text:
-                            continue
-                        block_text += text + " "
-                        if s["size"] > max_size:
-                            max_size = s["size"]
-                        if "bold" in s["font"].lower():
-                            is_bold = True
                 block_text = block_text.strip()
-                if len(block_text) < 3:
-                    continue
-                # Skip boilerplate blocks even within content pages
-                if is_boilerplate_block(block_text):
                     continue
-                # HEURISTIC 1: TOPIC (Large Header — 2pt+ above body)
                 if max_size > body_size + 2:
                     if current_subtopic and current_topic:
                         current_topic["children"].append(current_subtopic)
                         current_subtopic = None
                     if current_topic:
                         syllabus_tree.append(current_topic)
                     current_topic = {
-                        "id": f"{self.unique_id}_{len(syllabus_tree)}",
-                        "title": block_text,
-                        "type": "topic",
                         "children": []
                     }
                     current_subtopic = None
-                # HEURISTIC 2: SUBTOPIC (Bold, numbered, or keyword-led)
                 elif (is_bold and max_size >= body_size) or \
                      (topic_pattern.match(block_text) and max_size >= body_size):
                     if current_subtopic and current_topic:
                         current_topic["children"].append(current_subtopic)
                     if not current_topic:
                         current_topic = {
                             "id": f"{self.unique_id}_root",
@@ -264,27 +339,25 @@ class PDFParser:
                             "type": "topic",
                             "children": []
                         }
                     current_subtopic = {
-                        "id": f"{current_topic['id']}_{len(current_topic['children'])}",
-                        "title": block_text,
-                        "type": "subtopic",
                         "content": []
                     }
-                # HEURISTIC 3: CONTENT (Body Text)
                 elif max_size <= body_size + 1:
                     if current_subtopic:
                         current_subtopic["content"].append(block_text)
                     elif current_topic:
                         current_subtopic = {
-                            "id": f"{current_topic['id']}_intro",
-                            "title": "Overview",
-                            "type": "subtopic",
                             "content": [block_text]
                         }
-        # Flush remainders
         if current_subtopic and current_topic:
             current_topic["children"].append(current_subtopic)
         if current_topic:
@@ -292,646 +365,407 @@ class PDFParser:
         return {
             "meta": {
-                "id": self.unique_id,
-                "subject": self.subject_name,
-                "code": self.subject_code,
-                "level": self.level,
-                "filename": self.filename,
                 "indexed_at": int(time.time())
             },
             "tree": syllabus_tree
         }
-# -----------------------------------------------------------------------------
-# 3. PAST EXAM PAPER PARSER
-# -----------------------------------------------------------------------------
 class ExamPaperParser:
-    """
-    Extracts metadata and full text from past exam PDFs.
-    Expected naming: syllabi_code_year_session_paper.pdf
-    E.g.:  9702_2023_May_Paper1.pdf  or  9702_2023_s1.pdf
-    Falls back to filename parsing when possible.
-    """
     def __init__(self, filepath):
-        self.filepath = filepath
-        self.filename = os.path.basename(filepath)
-        self.doc = fitz.open(filepath)
-        parts = filepath.replace("\\", "/").split("/")
-        self.level = parts[-2] if len(parts) > 1 else "General"
-        # Parse subject code from filename
-        code_match = re.search(r'\b(\d{4})\b', self.filename)
-        self.subject_code = code_match.group(1) if code_match else "0000"
-        self.unique_id = f"{self.level}_{self.subject_code}"
-        # Parse year
-        year_match = re.search(r'\b(20\d{2}|19\d{2})\b', self.filename)
-        self.year = year_match.group(1) if year_match else "Unknown"
-        # Parse session (May/June, Oct/Nov, etc.)
-        session_match = re.search(
-            r'(may[_\-]?june|oct[_\-]?nov|feb[_\-]?mar|summer|winter|s\d|w\d|m\d)',
-            self.filename, re.IGNORECASE
-        )
-        self.session = session_match.group(1).upper() if session_match else "Unknown"
-        # Parse paper number
-        paper_match = re.search(r'[_\-]p(\d)|paper[\s_\-]?(\d)', self.filename, re.IGNORECASE)
-        if paper_match:
-            self.paper_num = paper_match.group(1) or paper_match.group(2)
-        else:
-            self.paper_num = "1"
-        self.paper_id = f"{self.unique_id}_{self.year}_{self.session}_P{self.paper_num}"
     def extract_pages(self):
-        """Extract text per page."""
-        pages = []
-        for i, page in enumerate(self.doc):
-            text = page.get_text("text").strip()
-            if text:
-                pages.append({
-                    "page": i + 1,
-                    "text": text[:3000]  # cap per page to avoid huge payloads
-                })
-        return pages
     def extract_questions(self):
-        """
-        Heuristic: questions usually start with a number followed by a period/bracket.
-        E.g. "1." or "1 " or "(a)" at start of paragraph.
-        Returns list of { number, text }.
-        """
-        questions = []
-        full_text = "\n".join(p["text"] for p in self.extract_pages())
-        # Split by question numbers
-        q_pattern = re.compile(
-            r'(?:^|\n)\s*(\d{1,2})\s*[\.\)]\s+(.+?)(?=\n\s*\d{1,2}\s*[\.\)]|\Z)',
-            re.DOTALL | re.MULTILINE
-        )
-        for m in q_pattern.finditer(full_text):
-            q_num = int(m.group(1))
-            q_text = m.group(2).strip()
-            if len(q_text) > 20:  # filter noise
-                questions.append({"number": q_num, "text": q_text[:2000]})
-        return questions
     def parse(self):
-        pages = self.extract_pages()
-        questions = self.extract_questions()
         return {
             "meta": {
-                "paperId": self.paper_id,
-                "subjectId": self.unique_id,
                 "subjectCode": self.subject_code,
-                "level": self.level,
-                "year": self.year,
-                "session": self.session,
                 "paperNumber": self.paper_num,
-                "filename": self.filename,
-                "totalPages": len(self.doc),
-                "indexed_at": int(time.time())
             },
-            "pages": pages,
-            "questions": questions
         }
-# -----------------------------------------------------------------------------
-# 4. THE VECTOR ENGINE (Embeddings & Search)
-# -----------------------------------------------------------------------------
 def generate_embeddings(texts):
-    """Generates embeddings using Gemini API."""
-    if not GEMINI_API_KEY:
-        logger.warning("No Gemini API Key. Using dummy vectors.")
         return [np.zeros(768).tolist() for _ in texts]
-    client_g = genai.Client(api_key=GEMINI_API_KEY)
     results = []
-    batch_size = 10
-    for i in range(0, len(texts), batch_size):
-        batch = texts[i:i + batch_size]
         try:
-            resp = client_g.models.embed_content(
-                model=EMBEDDING_MODEL,
-                contents=batch,
-            )
-            for embedding in resp.embeddings:
-                results.append(embedding.values)
         except Exception as e:
-            logger.error(f"Embedding batch {i} failed: {e}")
             for _ in batch:
                 results.append(np.zeros(768).tolist())
     return results
-# -----------------------------------------------------------------------------
-# 5. FIREBASE-BACKED INDEX BUILDER
-# -----------------------------------------------------------------------------
 def load_index_from_firebase():
-    """
-    Tries to load the full index from Firebase.
-    Returns True if successfully loaded.
-    """
     global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX, EXAM_MAP
-    if not FIREBASE_AVAILABLE:
-        return False
-    logger.info("Attempting to load index from Firebase...")
     try:
-        # Load syllabus map
         fb_syllabi = fb_get("data_api/syllabi")
-        if not fb_syllabi:
-            logger.info("No syllabus data in Firebase yet.")
-            return False
         SYLLABUS_MAP = fb_syllabi
-        # Load vector DB
         fb_vectors = fb_get("data_api/vectors")
-        if not fb_vectors:
-            logger.info("No vector data in Firebase yet.")
-            return False
-        VECTOR_DB = []
-        valid_vectors = []
-        for entry in fb_vectors.values() if isinstance(fb_vectors, dict) else fb_vectors:
-            if not entry:
-                continue
             vec = np.array(entry["vector"])
-            VECTOR_DB.append({
-                "vector": vec,
-                "meta": entry["meta"]
-            })
-            valid_vectors.append(vec)
-        if valid_vectors:
-            VECTOR_MATRIX = np.vstack(valid_vectors)
-        # Load exam map
         fb_exams = fb_get("data_api/exams")
         if fb_exams:
             EXAM_MAP = fb_exams
-        logger.info(
-            f"Loaded from Firebase: {len(SYLLABUS_MAP)} syllabi, "
-            f"{len(VECTOR_DB)} vectors, {len(EXAM_MAP)} exam subjects."
-        )
         return True
     except Exception as e:
-        logger.error(f"Failed to load from Firebase: {e}")
         return False
-def save_syllabus_to_firebase(subject_id: str, data: dict):
-    """Save a single syllabus entry to Firebase."""
-    # Store tree without numpy arrays (just plain dicts)
-    fb_set(f"data_api/syllabi/{subject_id}", data)
-def save_vectors_to_firebase(vector_entries: list):
-    """Save vector entries to Firebase (store as lists, not numpy)."""
     fb_data = {}
-    for i, entry in enumerate(vector_entries):
-        key = f"v_{i:06d}"
-        fb_data[key] = {
             "vector": entry["vector"].tolist() if isinstance(entry["vector"], np.ndarray) else entry["vector"],
-            "meta": entry["meta"]
         }
     fb_set("data_api/vectors", fb_data)
-def save_exam_to_firebase(subject_id: str, paper_data: dict):
-    """Save a parsed exam paper under the subject's exam list."""
-    paper_id = paper_data["meta"]["paperId"]
-    # Sanitize key
-    safe_key = re.sub(r'[.\[\]#$/]', '_', paper_id)
-    fb_set(f"data_api/exams/{subject_id}/{safe_key}", paper_data)
 def build_index():
-    """
-    Walks directories, parses PDFs, builds JSON tree and Vector Index,
-    then persists everything to Firebase.
-    """
     global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX, EXAM_MAP
-    logger.info("🚀 Starting Build Process...")
-    # ---- SYLLABI ----
     parsed_data = []
     if os.path.exists(SYLLABI_DIR):
-        for root, dirs, files in os.walk(SYLLABI_DIR):
-            for file in sorted(files):
-                if file.endswith(".pdf"):
-                    path = os.path.join(root, file)
-                    logger.info(f"Parsing syllabus: {path}")
-                    try:
-                        parser = PDFParser(path)
-                        data = parser.parse()
-                        parsed_data.append(data)
-                        SYLLABUS_MAP[data["meta"]["id"]] = data
-                        save_syllabus_to_firebase(data["meta"]["id"], data)
-                    except Exception as e:
-                        logger.error(f"Failed to parse {path}: {e}")
-    else:
-        logger.warning(f"Directory {SYLLABI_DIR} not found.")
-    # ---- PAST EXAMS ----
     if os.path.exists(PAST_EXAMS_DIR):
-        for root, dirs, files in os.walk(PAST_EXAMS_DIR):
-            for file in sorted(files):
-                if file.endswith(".pdf"):
-                    path = os.path.join(root, file)
-                    logger.info(f"Parsing exam paper: {path}")
-                    try:
-                        parser = ExamPaperParser(path)
-                        exam_data = parser.parse()
-                        subject_id = exam_data["meta"]["subjectId"]
-                        if subject_id not in EXAM_MAP:
-                            EXAM_MAP[subject_id] = {}
-                        paper_id = exam_data["meta"]["paperId"]
-                        safe_key = re.sub(r'[.\[\]#$/]', '_', paper_id)
-                        EXAM_MAP[subject_id][safe_key] = exam_data
-                        save_exam_to_firebase(subject_id, exam_data)
-                    except Exception as e:
-                        logger.error(f"Failed to parse exam {path}: {e}")
-    else:
-        logger.info(f"No past_exams directory found at {PAST_EXAMS_DIR}. Skipping.")
-    # ---- VECTORIZATION (syllabi only) ----
     if not parsed_data:
-        logger.info("No new syllabus data to vectorize.")
         return
-    chunks_to_embed = []
-    chunk_metadata = []
     for item in parsed_data:
-        meta_base = item["meta"]
         for topic in item["tree"]:
             for sub in topic.get("children", []):
-                text_blob = "\n".join(sub.get("content", []))
-                if len(text_blob) < 10:
-                    continue
-                rich_text = (
-                    f"{meta_base['subject']} {meta_base['level']} "
-                    f"- {topic['title']} - {sub['title']}:\n{text_blob}"
-                )
-                chunks_to_embed.append(rich_text)
-                chunk_metadata.append({
-                    "subject_id": meta_base["id"],
-                    "topic_id": topic["id"],
                     "subtopic_id": sub["id"],
-                    "title": sub["title"],
-                    "content": text_blob
                 })
-    logger.info(f"🧮 Generating embeddings for {len(chunks_to_embed)} chunks...")
-    vectors = generate_embeddings(chunks_to_embed)
     VECTOR_DB = []
-    valid_vectors = []
-    for i, vec in enumerate(vectors):
-        np_vec = np.array(vec)
-        VECTOR_DB.append({
-            "vector": np_vec,
-            "meta": chunk_metadata[i]
-        })
-        valid_vectors.append(np_vec)
-    if valid_vectors:
-        VECTOR_MATRIX = np.vstack(valid_vectors)
-    # Persist to Firebase
-    save_vectors_to_firebase(VECTOR_DB)
-    logger.info(
-        f"✅ Indexing Complete. "
-        f"{len(SYLLABUS_MAP)} syllabi, {len(VECTOR_DB)} vectors, "
-        f"{sum(len(v) for v in EXAM_MAP.values())} exam papers."
-    )
-# -----------------------------------------------------------------------------
-# 6. DIRECTORY WATCHER — Auto-index new PDFs
-# -----------------------------------------------------------------------------
 _indexed_files = set()
-def _collect_existing_files():
-    """Collect all currently-present PDFs to avoid re-indexing on boot."""
     for d in [SYLLABI_DIR, PAST_EXAMS_DIR]:
-        if not os.path.exists(d):
-            continue
         for root, _, files in os.walk(d):
             for f in files:
                 if f.endswith(".pdf"):
                     _indexed_files.add(os.path.join(root, f))
-def _watch_directories(interval=30):
-    """Background thread: detect new PDFs and index them."""
     while True:
         time.sleep(interval)
         for directory, is_exam in [(SYLLABI_DIR, False), (PAST_EXAMS_DIR, True)]:
-            if not os.path.exists(directory):
-                continue
             for root, _, files in os.walk(directory):
-                for file in files:
-                    if not file.endswith(".pdf"):
-                        continue
-                    path = os.path.join(root, file)
-                    if path in _indexed_files:
-                        continue
-                    logger.info(f"🆕 New PDF detected: {path}")
                     _indexed_files.add(path)
                     try:
                         if is_exam:
-                            parser = ExamPaperParser(path)
                             exam_data = parser.parse()
-                            subject_id = exam_data["meta"]["subjectId"]
-                            if subject_id not in EXAM_MAP:
-                                EXAM_MAP[subject_id] = {}
-                            paper_id = exam_data["meta"]["paperId"]
-                            safe_key = re.sub(r'[.\[\]#$/]', '_', paper_id)
-                            EXAM_MAP[subject_id][safe_key] = exam_data
-                            save_exam_to_firebase(subject_id, exam_data)
                         else:
                             parser = PDFParser(path)
-                            data = parser.parse()
                             SYLLABUS_MAP[data["meta"]["id"]] = data
-                            save_syllabus_to_firebase(data["meta"]["id"], data)
-                            # Re-vectorize just this document
                             _incremental_vectorize(data)
                     except Exception as e:
-                        logger.error(f"Error indexing new file {path}: {e}")
-def _incremental_vectorize(syllabus_data: dict):
-    """Add vectors for a single newly-uploaded syllabus."""
-    global VECTOR_DB, VECTOR_MATRIX
-    meta_base = syllabus_data["meta"]
-    chunks = []
-    metas = []
-    for topic in syllabus_data["tree"]:
-        for sub in topic.get("children", []):
-            text_blob = "\n".join(sub.get("content", []))
-            if len(text_blob) < 10:
-                continue
-            rich_text = (
-                f"{meta_base['subject']} {meta_base['level']} "
-                f"- {topic['title']} - {sub['title']}:\n{text_blob}"
-            )
-            chunks.append(rich_text)
-            metas.append({
-                "subject_id": meta_base["id"],
-                "topic_id": topic["id"],
-                "subtopic_id": sub["id"],
-                "title": sub["title"],
-                "content": text_blob
-            })
-    if not chunks:
-        return
-    vectors = generate_embeddings(chunks)
-    for i, vec in enumerate(vectors):
-        np_vec = np.array(vec)
-        VECTOR_DB.append({"vector": np_vec, "meta": metas[i]})
-    if VECTOR_DB:
-        VECTOR_MATRIX = np.vstack([e["vector"] for e in VECTOR_DB])
-    # Persist full updated vector set
-    save_vectors_to_firebase(VECTOR_DB)
-    logger.info(f"Incremental vectorize complete for {meta_base['id']}.")
-# -----------------------------------------------------------------------------
-# 7. API ENDPOINTS
-# -----------------------------------------------------------------------------
 @app.route('/health', methods=['GET'])
 def health():
     return jsonify({
-        "status": "online",
         "subjects_loaded": list(SYLLABUS_MAP.keys()),
-        "vector_chunks": len(VECTOR_DB),
-        "exam_subjects": list(EXAM_MAP.keys()),
-        "firebase": FIREBASE_AVAILABLE
     })
 @app.route('/v1/structure/<subject_id>', methods=['GET'])
 def get_structure(subject_id):
-    """Returns the static JSON tree for navigation UI."""
     data = SYLLABUS_MAP.get(subject_id)
     if not data:
         return jsonify({"error": "Subject not found"}), 404
     return jsonify(data)
-@app.route('/v1/subjects', methods=['GET'])
-def list_subjects():
-    """Returns metadata for all indexed syllabi."""
-    result = []
-    for sid, data in SYLLABUS_MAP.items():
-        result.append(data.get("meta", {"id": sid}))
-    return jsonify(result)
 @app.route('/v1/search', methods=['POST'])
 def search():
-    """
-    Semantic Retrieval.
-    Input: { "query": "...", "filter_subject_id": "..." (optional) }
-    """
-    if VECTOR_MATRIX is None or len(VECTOR_DB) == 0:
         return jsonify({"error": "Index not ready"}), 503
-    data = request.json or {}
-    query = data.get("query")
-    subject_filter = data.get("filter_subject_id")
-    if not query:
         return jsonify({"error": "Query required"}), 400
-    if not GEMINI_API_KEY:
         return jsonify({"error": "Embedding API not configured"}), 503
-    client_g = genai.Client(api_key=GEMINI_API_KEY)
     try:
-        resp = client_g.models.embed_content(model=EMBEDDING_MODEL, contents=query)
-        query_vec = np.array(resp.embeddings[0].values).reshape(1, -1)
     except Exception as e:
         return jsonify({"error": str(e)}), 500
-    scores = cosine_similarity(query_vec, VECTOR_MATRIX)[0]
-    top_indices = np.argsort(scores)[::-1]
     results = []
-    count = 0
-    for idx in top_indices:
-        if scores[idx] < 0.3:
-            break
-        entry = VECTOR_DB[idx]
-        meta = entry["meta"]
-        if subject_filter and meta["subject_id"] != subject_filter:
-            continue
-        results.append({
-            "score": float(scores[idx]),
-            "subject_id": meta["subject_id"],
-            "title": meta["title"],
-            "content": meta["content"],
-            "node_id": meta["subtopic_id"]
-        })
-        count += 1
-        if count >= 5:
-            break
     return jsonify({"results": results})
 @app.route('/v1/exams', methods=['GET'])
 def list_exams():
-    """
-    List past exam papers.
-    Query param: subject_id (optional)
-    """
-    subject_id = request.args.get("subject_id")
-    if subject_id:
-        papers = EXAM_MAP.get(subject_id, {})
-        result = [p["meta"] for p in papers.values() if isinstance(p, dict) and "meta" in p]
-    else:
-        result = []
-        for sid, papers in EXAM_MAP.items():
-            for p in papers.values():
-                if isinstance(p, dict) and "meta" in p:
-                    result.append(p["meta"])
-    return jsonify(result)
 @app.route('/v1/exams/<paper_id>', methods=['GET'])
 def get_exam(paper_id):
-    """
-    Get full exam paper (pages + questions).
-    paper_id format: A_9702_2023_MAY_P1
-    """
-    safe_key = re.sub(r'[.\[\]#$/]', '_', paper_id)
-    for sid, papers in EXAM_MAP.items():
         for key, paper in papers.items():
-            if key == safe_key or (isinstance(paper, dict) and
-               paper.get("meta", {}).get("paperId") == paper_id):
                 return jsonify(paper)
-    return jsonify({"error": "Exam paper not found"}), 404
 @app.route('/v1/exams/<paper_id>/questions', methods=['GET'])
 def get_exam_questions(paper_id):
     """Return only the extracted questions (plus metadata) of a past paper.

     The paper is looked up by sanitised storage key or by ``meta.paperId``;
     a 404 JSON error is returned when nothing matches.
     """
     storage_key = re.sub(r'[.\[\]#$/]', '_', paper_id)
     for subject_papers in EXAM_MAP.values():
         for stored_key, record in subject_papers.items():
             matched = stored_key == storage_key or (
                 isinstance(record, dict)
                 and record.get("meta", {}).get("paperId") == paper_id
             )
             if not matched:
                 continue
             payload = {
                 "paperId": paper_id,
                 "meta": record.get("meta"),
                 "questions": record.get("questions", []),
             }
             return jsonify(payload)
     return jsonify({"error": "Exam paper not found"}), 404
 @app.route('/v1/rebuild', methods=['POST'])
 def trigger_rebuild():
     """Start a full index rebuild in a background thread (admin use).

     When the ``REBUILD_SECRET`` environment variable is set, the request
     must carry ``Authorization: Bearer <secret>``; otherwise 401 is returned.
     Responds 202 as soon as the rebuild thread has been started.
     """
     expected_secret = os.environ.get("REBUILD_SECRET", "")
     supplied = request.headers.get("Authorization", "")
     if expected_secret and supplied != f"Bearer {expected_secret}":
         return jsonify({"error": "Unauthorized"}), 401

     def _rebuild_bg():
         # Drop the in-memory index first so build_index() starts from scratch.
         global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX, EXAM_MAP
         SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX, EXAM_MAP = {}, [], None, {}
         build_index()

     worker = threading.Thread(target=_rebuild_bg, daemon=True)
     worker.start()
     return jsonify({"status": "rebuild started"}), 202
# -----------------------------------------------------------------------------
# 8. STARTUP BOOTSTRAP
# -----------------------------------------------------------------------------
def start_app():
    """Prepare data directories, load or build the index, start the watcher."""
    # Missing base directories are created with their "A" and "O" level folders.
    for base_dir in (SYLLABI_DIR, PAST_EXAMS_DIR):
        if os.path.exists(base_dir):
            continue
        for level in ("A", "O"):
            os.makedirs(os.path.join(base_dir, level), exist_ok=True)
        logger.info(f"Created empty directory: {base_dir}")

    # Prefer the Firebase cache; fall back to a full build from the PDFs.
    if load_index_from_firebase():
        logger.info("Served from Firebase cache. Skipping full rebuild.")
    else:
        build_index()

    # Record existing files so the watcher does not re-index them.
    _collect_existing_files()

    # Background watcher for newly uploaded PDFs.
    threading.Thread(target=_watch_directories, daemon=True).start()
    logger.info("Directory watcher started.")
 # Run the startup bootstrap at import time, inside a Flask application context.
 with app.app_context():
     start_app()

 import re
 import time
 import threading
+import base64
 import numpy as np
 import fitz  # PyMuPDF
 from flask import Flask, request, jsonify
 import firebase_admin
 from firebase_admin import credentials, db as firebase_db
# ---------------------------------------------------------------------------
# CONFIGURATION
# ---------------------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Directories scanned for syllabus and past-paper PDFs.
SYLLABI_DIR = "syllabi"
PAST_EXAMS_DIR = "past_exams"

# Gemini settings; the API key comes from the environment (None when unset).
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
EMBEDDING_MODEL = "models/text-embedding-004"
VISION_MODEL = "gemini-2.5-flash"
# ---------------------------------------------------------------------------
# COMPLETE SUBJECT REGISTRY  (all 24 PDFs on HuggingFace)
# ---------------------------------------------------------------------------
# Registry keys have the form "<level>_<4-digit syllabus code>".
A_LEVEL_SUBJECTS = {
    f"A_{code}": name
    for code, name in (
        ("9706", "Accounting"),
        ("9700", "Biology"),
        ("9609", "Business"),
        ("9701", "Chemistry"),
        ("9618", "Computer Science"),
        ("9708", "Economics"),
        ("9231", "Further Mathematics"),
        ("9489", "History"),
        ("9695", "Literature in English"),
        ("9709", "Mathematics"),
        ("9702", "Physics"),
        ("9699", "Sociology"),
        ("9395", "Travel and Tourism"),
    )
}

O_LEVEL_SUBJECTS = {
    f"O_{code}": name
    for code, name in (
        ("0452", "Accounting"),
        ("0610", "Biology"),
        ("0450", "Business Studies"),
        ("0620", "Chemistry"),
        ("0478", "Computer Science"),
        ("0500", "English Language"),
        ("0475", "English Literature"),
        ("0680", "Environmental Management"),
        ("0460", "Geography"),
        ("0470", "History"),
        ("0625", "Physics"),
    )
}

# Combined lookup: A-level entries first, then O-level entries.
ALL_SUBJECTS = dict(A_LEVEL_SUBJECTS)
ALL_SUBJECTS.update(O_LEVEL_SUBJECTS)
# ---------------------------------------------------------------------------
# GLOBAL STATE
# ---------------------------------------------------------------------------
# In-memory index shared by the request handlers and the background threads.
SYLLABUS_MAP = {}      # subject id -> parsed syllabus ({"meta": ..., "tree": ...})
VECTOR_DB = []         # list of {"vector": ..., "meta": ...} entries
VECTOR_MATRIX = None   # stacked vectors of VECTOR_DB; None until vectors exist
EXAM_MAP = {}          # subject id -> {paper key: parsed exam paper}

app = Flask(__name__)
CORS(app)
# ---------------------------------------------------------------------------
# FIREBASE
# ---------------------------------------------------------------------------
firebase_db_ref = None
FIREBASE_AVAILABLE = False


def init_firebase():
    """Initialise the Firebase Admin SDK and the root database reference.

    The service-account JSON is read from the ``FIREBASE`` environment
    variable and the database URL from ``Firebase_DB``.

    Returns:
        bool: True when Firebase is ready; False when an environment
        variable is missing or initialisation fails (the error is logged).
    """
    global firebase_db_ref, FIREBASE_AVAILABLE
    try:
        creds_str = os.environ.get("FIREBASE")
        db_url = os.environ.get("Firebase_DB")
        if not creds_str or not db_url:
            logger.warning("Firebase env vars missing.")
            return False
        # Only initialise the default app if no app has been registered yet.
        if not firebase_admin._apps:
            cred = credentials.Certificate(json.loads(creds_str))
            firebase_admin.initialize_app(cred, {"databaseURL": db_url})
        firebase_db_ref = firebase_db.reference()
        FIREBASE_AVAILABLE = True
        logger.info("Firebase initialised (Data API).")
        return True
    except Exception as e:
        logger.error(f"Firebase init failed: {e}")
        # Return an explicit False: falling through would return None, which
        # the module-level assignment below would store in FIREBASE_AVAILABLE.
        FIREBASE_AVAILABLE = False
        return False


FIREBASE_AVAILABLE = init_firebase()
def fb_set(path, data):
    """Write ``data`` at ``path`` below the Firebase root (best effort).

    Does nothing when Firebase is unavailable; write errors are logged and
    swallowed.
    """
    if FIREBASE_AVAILABLE:
        try:
            firebase_db_ref.child(path).set(data)
        except Exception as e:
            logger.error(f"FB write [{path}]: {e}")
def fb_get(path):
    """Read the value stored at ``path`` below the Firebase root.

    Returns None when Firebase is unavailable or the read fails (the error
    is logged).
    """
    if not FIREBASE_AVAILABLE:
        return None
    try:
        node = firebase_db_ref.child(path)
        return node.get()
    except Exception as e:
        logger.error(f"FB read [{path}]: {e}")
    return None
# ---------------------------------------------------------------------------
# GEMINI CLIENT
# ---------------------------------------------------------------------------
_gemini_client = None


def get_gemini():
    """Return the shared Gemini client, creating it on first use.

    Returns None while ``GEMINI_API_KEY`` is not set.
    """
    global _gemini_client
    if _gemini_client is not None:
        return _gemini_client
    if GEMINI_API_KEY:
        _gemini_client = genai.Client(api_key=GEMINI_API_KEY)
    return _gemini_client
+# ---------------------------------------------------------------------------
+# VISION-BASED PAGE CLASSIFIER
+# Renders each page as an image and asks Gemini to classify it.
+# Falls back to heuristic if vision call fails or key is absent.
+# ---------------------------------------------------------------------------
# A whole line that is exactly one of the known front-matter headings.
_BOILERPLATE_HEADING_PATTERN = (
    r'^\s*(about\s+this\s+syllabus|foreword|acknowledgements?|'
    r'why\s+choose\s+(cambridge|zimsec|this\s+syllabus)|cambridge\s+learner|'
    r'key\s+benefits?|how\s+to\s+use\s+this\s+syllabus|'
    r'support\s+for\s+(cambridge|teachers)|resource\s+list|'
    r'further\s+information|copyright|legal\s+notice|'
    r'changes\s+to\s+this\s+syllabus|university\s+of\s+cambridge|'
    r'cambridge\s+assessment\s+international|published\s+by|'
    r'contents?\s*$|table\s+of\s+contents?|'
    r'assessment\s+at\s+a\s+glance|syllabus\s+at\s+a\s+glance|'
    r'grade\s+descriptions?|command\s+words|glossary\s+of\s+command|'
    r'mathematical\s+notation|other\s+cambridge\s+qualifications|'
    r'how\s+to\s+offer|progression|post[-\s]?qualification|'
    r'school\s+supported\s+candidate|cambridge\s+primary|cambridge\s+lower\s+secondary)\s*$'
)
DEFINITE_BOILERPLATE_RE = re.compile(_BOILERPLATE_HEADING_PATTERN, re.I)

# A line start that looks like real syllabus content (numbered heading,
# "unit 1", "learning objectives", ...).
_CONTENT_START_PATTERN = (
    r'(^|\n)\s*(\d+\.?\d*\s+[A-Z][a-z]|\d+\s+[A-Z][a-z]|'
    r'subject\s+content|'
    r'unit\s+\d|topic\s+\d|section\s+\d|module\s+\d|'
    r'component\s+\d|paper\s+\d|'
    r'learning\s+objectives|knowledge\s+and\s+understanding|'
    r'candidates\s+should\s+be\s+able)'
)
CONTENT_START_RE = re.compile(_CONTENT_START_PATTERN, re.I)
def _page_to_base64_png(page, dpi=72) -> str:
    """Render ``page`` to an RGB PNG and return it as base64 text.

    A zoom factor of ``dpi / 72`` is applied on both axes, so the default
    renders at scale 1.
    """
    scale = dpi / 72
    pixmap = page.get_pixmap(matrix=fitz.Matrix(scale, scale), colorspace=fitz.csRGB)
    png_bytes = pixmap.tobytes("png")
    return base64.b64encode(png_bytes).decode("utf-8")
def _vision_classify_page(page, page_num: int, subject_name: str) -> str:
    """Ask the vision model whether a rendered page is front matter or content.

    Returns 'boilerplate' or 'content' when the reply contains that word,
    and 'uncertain' when no client is configured, the reply is
    unrecognised, or the call raises (the error is logged as a warning).
    """
    gemini = get_gemini()
    if gemini is None:
        return "uncertain"
    try:
        image_b64 = _page_to_base64_png(page)
        instructions = (
            f"This is page {page_num + 1} of a Cambridge International AS & A Level / "
            f"IGCSE syllabus for {subject_name}.\n\n"
            "Classify this page as ONE of:\n"
            "BOILERPLATE - administrative or introductory content: foreword, about this "
            "syllabus, why choose Cambridge, key benefits, Cambridge learner attributes, "
            "how to use this syllabus, table of contents, copyright, assessment overview "
            "tables, grade descriptions, command words, mathematical notation appendix, "
            "support information, changes to syllabus, qualification overview.\n"
            "CONTENT - actual subject matter students must learn: topic lists, learning "
            "objectives, numbered content sections, subject-specific knowledge points, "
            "skills, practical work descriptions, candidate assessment criteria.\n\n"
            "Reply with exactly one word: BOILERPLATE or CONTENT"
        )
        message = {
            "role": "user",
            "parts": [
                {"inline_data": {"mime_type": "image/png", "data": image_b64}},
                {"text": instructions},
            ],
        }
        reply = gemini.models.generate_content(model=VISION_MODEL, contents=[message])
        answer = (reply.text or "").strip().upper()
        # BOILERPLATE is tested first, so it wins if both words appear.
        for keyword, verdict in (("BOILERPLATE", "boilerplate"), ("CONTENT", "content")):
            if keyword in answer:
                return verdict
        return "uncertain"
    except Exception as e:
        logger.warning(f"Vision classify page {page_num}: {e}")
        return "uncertain"
def classify_all_pages(doc, subject_name: str, front_matter_pages: int = 40) -> list:
    """Label every page of ``doc`` as 'boilerplate' or 'content'.

    Only the first ``front_matter_pages`` pages are inspected; later pages
    are labelled 'content' without being read. An inspected page is
    labelled, in order, by: a first non-empty line matching
    ``DEFINITE_BOILERPLATE_RE`` (boilerplate), fewer than 30 characters of
    text (boilerplate), the vision model's verdict, and finally
    ``CONTENT_START_RE`` when the vision verdict is 'uncertain'.

    If no page at all ends up as 'content', the labels are recomputed with
    a pure heuristic: everything from the first page matching
    ``CONTENT_START_RE`` onwards is 'content'.

    Nothing is cached; every call re-classifies the document.

    Args:
        doc: iterable, sized collection of pages exposing ``get_text``.
        subject_name: subject name passed to the vision prompt and the logs.
        front_matter_pages: number of leading pages to inspect (default 40).

    Returns:
        list[str]: one label per page, in page order.
    """
    classifications = []
    n = len(doc)
    for i, page in enumerate(doc):
        # Pages beyond the front-matter zone are almost always content, so
        # their text is not extracted at all.
        if i >= front_matter_pages:
            classifications.append("content")
            continue
        text = page.get_text("text").strip()
        # Hard-rule catch on the first non-empty line
        first_line = next((l.strip() for l in text.splitlines() if l.strip()), "")
        if first_line and DEFINITE_BOILERPLATE_RE.match(first_line):
            classifications.append("boilerplate")
            continue
        # Empty page
        if len(text) < 30:
            classifications.append("boilerplate")
            continue
        # Vision call, with a regex fallback when it cannot decide
        verdict = _vision_classify_page(page, i, subject_name)
        if verdict == "uncertain":
            verdict = "content" if CONTENT_START_RE.search(text) else "boilerplate"
        classifications.append(verdict)
        logger.info(f"  Page {i+1}/{n}: {verdict}")
    # Safety: if everything was classified as boilerplate, use heuristic fallback
    if not any(c == "content" for c in classifications):
        logger.warning(f"  All pages BOILERPLATE for {subject_name} — applying heuristic fallback.")
        classifications = []
        found_content = False
        for page in doc:
            text = page.get_text("text")
            if not found_content and CONTENT_START_RE.search(text):
                found_content = True
            classifications.append("content" if found_content else "boilerplate")
    return classifications
# ---------------------------------------------------------------------------
# PDF PARSER  — Vision-enhanced
# ---------------------------------------------------------------------------
class PDFParser:
    """Parse a syllabus PDF into a topic -> subtopic -> content tree.

    The level is taken from the parent directory name and the subject code
    from the first four-digit run in the filename; together they form
    ``unique_id`` (e.g. ``A_9706``). Headings are recognised by font size
    relative to the most common ("body") font size and by bold/numbering.
    """

    def __init__(self, filepath):
        """Open ``filepath`` with PyMuPDF and derive the subject identifiers."""
        self.filepath = filepath
        self.filename = os.path.basename(filepath)
        self.doc = fitz.open(filepath)
        parts = filepath.replace("\\", "/").split("/")
        self.level = parts[-2] if len(parts) > 1 else "General"
        code_m = re.search(r'\d{4}', self.filename)
        self.subject_code = code_m.group(0) if code_m else "0000"
        self.unique_id = f"{self.level}_{self.subject_code}"
        self.subject_name = ALL_SUBJECTS.get(
            self.unique_id, self._fallback_subject_name(self.filename)
        )

    @staticmethod
    def _fallback_subject_name(filename):
        """Derive a readable subject name from a filename not in the registry.

        The "_<code>..." / "-<code>..." suffix is cut off BEFORE underscores
        are turned into spaces; doing it the other way round would leave the
        "_<code>" form unmatched.
        """
        stem = re.sub(r'[_\-]\d{4}.*', '', filename)
        return stem.replace('_', ' ').strip()

    @staticmethod
    def _summarise_block(block):
        """Return (text, largest font size, has bold span) for a text block.

        Spans that are empty after stripping are ignored entirely.
        """
        pieces = []
        max_size = 0
        is_bold = False
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                t = span["text"].strip()
                if not t:
                    continue
                pieces.append(t)
                if span["size"] > max_size:
                    max_size = span["size"]
                if "bold" in span["font"].lower():
                    is_bold = True
        return " ".join(pieces), max_size, is_bold

    def get_body_font_size(self):
        """Return the font size (rounded to 0.1) covering the most characters.

        Falls back to 10.0 when the document contains no text spans.
        """
        sizes = {}
        for page in self.doc:
            for b in page.get_text("dict")["blocks"]:
                for l in b.get("lines", []):
                    for s in l.get("spans", []):
                        sz = round(s["size"], 1)
                        sizes[sz] = sizes.get(sz, 0) + len(s["text"])
        return max(sizes, key=sizes.get) if sizes else 10.0

    def parse(self):
        """Build the syllabus tree from the pages classified as content.

        Returns:
            dict: ``{"meta": {...}, "tree": [topic, ...]}`` where each topic
            has ``id``, ``title``, ``type`` and ``children`` (subtopics with
            ``id``, ``title``, ``type`` and ``content``).
        """
        body_size = self.get_body_font_size()
        page_classes = classify_all_pages(self.doc, self.subject_name)
        topic_pattern = re.compile(r'^(\d+\.?\s|Key\s+Question\s)', re.IGNORECASE)
        logger.info(f"Parsing content of {self.filename} (body ~{body_size}pt)")
        content_page_count = sum(1 for c in page_classes if c == "content")
        logger.info(f"  {content_page_count} content pages out of {len(self.doc)} total")

        syllabus_tree = []
        current_topic = None
        current_subtopic = None
        for page_num, page in enumerate(self.doc):
            if page_classes[page_num] == "boilerplate":
                continue
            for b in page.get_text("dict")["blocks"]:
                block_text, max_size, is_bold = self._summarise_block(b)
                if len(block_text) < 3:
                    continue
                # Skip residual boilerplate blocks within content pages
                first_words = " ".join(block_text.split()[:6])
                if DEFINITE_BOILERPLATE_RE.match(first_words):
                    continue

                # TOPIC: clearly larger than body text
                if max_size > body_size + 2:
                    if current_subtopic and current_topic:
                        current_topic["children"].append(current_subtopic)
                    if current_topic:
                        syllabus_tree.append(current_topic)
                    current_topic = {
                        "id": f"{self.unique_id}_{len(syllabus_tree)}",
                        "title": block_text,
                        "type": "topic",
                        "children": [],
                    }
                    current_subtopic = None
                # SUBTOPIC: bold or numbered, at least body size
                elif max_size >= body_size and (is_bold or topic_pattern.match(block_text)):
                    if current_subtopic and current_topic:
                        current_topic["children"].append(current_subtopic)
                    if not current_topic:
                        # Subtopic seen before any topic heading: attach it to
                        # a synthetic root. The title is required because
                        # consumers index topic["title"].
                        current_topic = {
                            "id": f"{self.unique_id}_root",
                            "title": "General",
                            "type": "topic",
                            "children": [],
                        }
                    current_subtopic = {
                        "id": f"{current_topic['id']}_{len(current_topic['children'])}",
                        "title": block_text,
                        "type": "subtopic",
                        "content": [],
                    }
                # BODY
                elif max_size <= body_size + 1:
                    if current_subtopic:
                        current_subtopic["content"].append(block_text)
                    elif current_topic:
                        current_subtopic = {
                            "id": f"{current_topic['id']}_intro",
                            "title": "Overview",
                            "type": "subtopic",
                            "content": [block_text],
                        }

        # Flush whatever is still open so the last topic is not lost.
        if current_subtopic and current_topic:
            current_topic["children"].append(current_subtopic)
        if current_topic:
            syllabus_tree.append(current_topic)

        return {
            "meta": {
                "id": self.unique_id,
                "subject": self.subject_name,
                "code": self.subject_code,
                "level": self.level,
                "filename": self.filename,
                "indexed_at": int(time.time()),
            },
            "tree": syllabus_tree,
        }
# ---------------------------------------------------------------------------
# PAST EXAM PARSER
# ---------------------------------------------------------------------------
class ExamPaperParser:
    """Extract metadata, page text and numbered questions from a past paper.

    Metadata (subject code, year, session, paper number) is parsed from the
    filename; the level is the parent directory name.
    """

    # Digit lookarounds are used instead of \b: "_" is a word character, so
    # \b never matches between a digit and an underscore and names such as
    # "9702_2023_may_june_p1.pdf" would yield no code and no year.
    _CODE_RE = re.compile(r'(?<!\d)(\d{4})(?!\d)')
    _YEAR_RE = re.compile(r'(?<!\d)(20\d{2}|19\d{2})(?!\d)')
    _SESSION_RE = re.compile(
        r'(may[_\-]?june|oct[_\-]?nov|feb[_\-]?mar|summer|winter|s\d|w\d|m\d)',
        re.IGNORECASE,
    )
    _PAPER_RE = re.compile(r'[_\-]p(\d)|paper[\s_\-]?(\d)', re.IGNORECASE)
    _QUESTION_RE = re.compile(
        r'(?:^|\n)\s*(\d{1,2})\s*[\.\)]\s+(.+?)(?=\n\s*\d{1,2}\s*[\.\)]|\Z)',
        re.DOTALL | re.MULTILINE,
    )

    def __init__(self, filepath):
        """Open ``filepath`` with PyMuPDF and parse the filename metadata."""
        self.filepath = filepath
        self.filename = os.path.basename(filepath)
        self.doc = fitz.open(filepath)
        parts = filepath.replace("\\", "/").split("/")
        self.level = parts[-2] if len(parts) > 1 else "General"
        self._parse_filename()

    def _parse_filename(self):
        """Set subject_code, unique_id, year, session, paper_num and paper_id.

        Reads ``self.filename`` and ``self.level``. Missing parts default to
        "0000" (code), "Unknown" (year, session) and "1" (paper number).
        """
        name = self.filename
        code_m = self._CODE_RE.search(name)
        self.subject_code = code_m.group(1) if code_m else "0000"
        self.unique_id = f"{self.level}_{self.subject_code}"
        year_m = self._YEAR_RE.search(name)
        self.year = year_m.group(1) if year_m else "Unknown"
        sess_m = self._SESSION_RE.search(name)
        self.session = sess_m.group(1).upper() if sess_m else "Unknown"
        paper_m = self._PAPER_RE.search(name)
        self.paper_num = (paper_m.group(1) or paper_m.group(2)) if paper_m else "1"
        self.paper_id = f"{self.unique_id}_{self.year}_{self.session}_P{self.paper_num}"

    def extract_pages(self):
        """Return ``[{"page": n, "text": ...}]`` for every non-empty page.

        Page numbers are 1-based; text is stripped and capped at 3000
        characters. Each page's text is extracted once.
        """
        pages = []
        for i, p in enumerate(self.doc):
            text = p.get_text("text").strip()
            if text:
                pages.append({"page": i + 1, "text": text[:3000]})
        return pages

    def extract_questions(self, pages=None):
        """Return numbered questions found in the paper text.

        Args:
            pages: optional result of ``extract_pages()`` to reuse; when
                omitted the pages are extracted here.

        Returns:
            list[dict]: ``{"number": int, "text": str}`` entries; texts are
            capped at 2000 characters and candidates of 20 characters or
            fewer are dropped.
        """
        if pages is None:
            pages = self.extract_pages()
        full = "\n".join(p["text"] for p in pages)
        questions = []
        for m in self._QUESTION_RE.finditer(full):
            body = m.group(2).strip()
            if len(body) > 20:
                questions.append({"number": int(m.group(1)), "text": body[:2000]})
        return questions

    def parse(self):
        """Return the full paper record: ``meta``, ``pages`` and ``questions``."""
        pages = self.extract_pages()  # extracted once, shared with questions
        return {
            "meta": {
                "paperId": self.paper_id,
                "subjectId": self.unique_id,
                "subjectCode": self.subject_code,
                "level": self.level,
                "year": self.year,
                "session": self.session,
                "paperNumber": self.paper_num,
                "filename": self.filename,
                "totalPages": len(self.doc),
                "indexed_at": int(time.time()),
            },
            "pages": pages,
            "questions": self.extract_questions(pages),
        }
# ---------------------------------------------------------------------------
# EMBEDDINGS
# ---------------------------------------------------------------------------
def generate_embeddings(texts, batch_size=10, dim=768):
    """Embed ``texts`` with the Gemini embedding model, batch by batch.

    The result always has exactly one vector per input text, in input
    order, because callers pair vectors with metadata by position. When no
    client is configured, a batch call fails, or a batch returns the wrong
    number of embeddings, that batch is filled with zero vectors and the
    problem is logged.

    Args:
        texts: sequence of strings to embed.
        batch_size: number of texts sent per API call (default 10).
        dim: length of the zero vector used as a placeholder (default 768).

    Returns:
        list: one embedding (sequence of floats) per input text.
    """
    def zero_vector():
        return np.zeros(dim).tolist()

    client = get_gemini()
    if client is None:
        return [zero_vector() for _ in texts]

    results = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        try:
            resp = client.models.embed_content(model=EMBEDDING_MODEL, contents=batch)
            vectors = [emb.values for emb in resp.embeddings]
            if len(vectors) != len(batch):
                # A short reply would silently shift every later vector.
                raise ValueError(f"expected {len(batch)} embeddings, got {len(vectors)}")
        except Exception as e:
            logger.error(f"Embed batch {i}: {e}")
            vectors = [zero_vector() for _ in batch]
        results.extend(vectors)
    return results
# ---------------------------------------------------------------------------
# FIREBASE PERSISTENCE
# ---------------------------------------------------------------------------
def load_index_from_firebase():
    """Populate the in-memory index from the Firebase cache.

    Everything is read and decoded into locals first; the module globals
    are replaced only after the whole load succeeded, so a missing node or
    a decoding error leaves the previous state untouched.

    Returns:
        bool: True when syllabi and at least one vector were loaded; False
        when Firebase is unavailable, a required node is empty, no vector
        could be decoded, or an error occurred (logged).
    """
    global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX, EXAM_MAP
    if not FIREBASE_AVAILABLE:
        return False
    logger.info("Loading index from Firebase ...")
    try:
        fb_syllabi = fb_get("data_api/syllabi")
        if not fb_syllabi:
            return False
        fb_vectors = fb_get("data_api/vectors")
        if not fb_vectors:
            return False
        # The node may come back as a dict or as a list with empty slots.
        entries = fb_vectors.values() if isinstance(fb_vectors, dict) else fb_vectors
        vector_db = [
            {"vector": np.array(entry["vector"]), "meta": entry["meta"]}
            for entry in entries
            if entry
        ]
        if not vector_db:
            # A cache without usable vectors cannot serve search.
            return False
        matrix = np.vstack([entry["vector"] for entry in vector_db])
        fb_exams = fb_get("data_api/exams")

        # Commit: nothing above can fail past this point.
        SYLLABUS_MAP = fb_syllabi
        VECTOR_DB = vector_db
        VECTOR_MATRIX = matrix
        if fb_exams:
            EXAM_MAP = fb_exams
        logger.info(f"Loaded: {len(SYLLABUS_MAP)} syllabi, {len(VECTOR_DB)} vectors, {len(EXAM_MAP)} exam subjects.")
        return True
    except Exception as e:
        logger.error(f"Firebase load: {e}")
        return False
def save_syllabus(sid, data):
    """Persist one parsed syllabus under ``data_api/syllabi/<sid>``."""
    target = f"data_api/syllabi/{sid}"
    fb_set(target, data)
def save_all_vectors():
    """Write the whole VECTOR_DB to ``data_api/vectors`` in one call.

    Entries are keyed ``v_000000``, ``v_000001``, ... in list order; numpy
    vectors are converted to plain lists first.
    """
    def as_list(vector):
        return vector.tolist() if isinstance(vector, np.ndarray) else vector

    payload = {
        f"v_{position:06d}": {"vector": as_list(item["vector"]), "meta": item["meta"]}
        for position, item in enumerate(VECTOR_DB)
    }
    fb_set("data_api/vectors", payload)
def save_exam(sid, exam_data):
    """Persist one parsed exam paper under ``data_api/exams/<sid>/<key>``.

    The key is the paper id with the characters ``. [ ] # $ /`` replaced
    by underscores.
    """
    paper_id = exam_data["meta"]["paperId"]
    key = re.sub(r'[.\[\]#$/]', '_', paper_id)
    fb_set(f"data_api/exams/{sid}/{key}", exam_data)
# ---------------------------------------------------------------------------
# INDEX BUILDER
# ---------------------------------------------------------------------------
def _iter_pdf_paths(directory):
    """Yield the paths of all ``.pdf`` files below ``directory`` (sorted per folder)."""
    if not os.path.exists(directory):
        return
    for root, _, files in os.walk(directory):
        for f in sorted(files):
            if f.endswith(".pdf"):
                yield os.path.join(root, f)


def _close_parser_doc(parser):
    """Close the PDF handle held by a parser; never raises."""
    if parser is None:
        return
    try:
        parser.doc.close()
    except Exception as e:
        logger.warning(f"Closing PDF failed: {e}")


def _chunks_for_syllabus(item):
    """Return (texts, metas) with one entry per subtopic holding >= 10 chars."""
    mb = item["meta"]
    chunks, metas = [], []
    for topic in item["tree"]:
        # .get(): a topic created without a title must not abort the build.
        topic_title = topic.get("title", "")
        for sub in topic.get("children", []):
            blob = "\n".join(sub.get("content", []))
            if len(blob) < 10:
                continue
            chunks.append(f"{mb['subject']} {mb['level']} - {topic_title} - {sub['title']}:\n{blob}")
            metas.append({
                "subject_id": mb["id"],
                "topic_id": topic["id"],
                "subtopic_id": sub["id"],
                "title": sub["title"],
                "content": blob,
            })
    return chunks, metas


def build_index():
    """Parse every syllabus and past paper on disk and rebuild the vector index.

    Parsed syllabi and exams are stored in SYLLABUS_MAP / EXAM_MAP and
    persisted to Firebase as they are produced; a file that fails to parse
    is logged and skipped. Each PDF handle is closed once its file has been
    processed. If no syllabus was parsed, the vector index is left as is.
    """
    global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX, EXAM_MAP
    logger.info("Full index build starting ...")

    parsed_data = []
    for path in _iter_pdf_paths(SYLLABI_DIR):
        logger.info(f"Syllabus: {path}")
        parser = None
        try:
            parser = PDFParser(path)
            data = parser.parse()
            parsed_data.append(data)
            SYLLABUS_MAP[data["meta"]["id"]] = data
            save_syllabus(data["meta"]["id"], data)
        except Exception as e:
            logger.error(f"{path}: {e}")
        finally:
            _close_parser_doc(parser)

    for path in _iter_pdf_paths(PAST_EXAMS_DIR):
        logger.info(f"Exam: {path}")
        parser = None
        try:
            parser = ExamPaperParser(path)
            exam_data = parser.parse()
            sid = exam_data["meta"]["subjectId"]
            safe = re.sub(r'[.\[\]#$/]', '_', exam_data["meta"]["paperId"])
            EXAM_MAP.setdefault(sid, {})[safe] = exam_data
            save_exam(sid, exam_data)
        except Exception as e:
            logger.error(f"{path}: {e}")
        finally:
            _close_parser_doc(parser)

    if not parsed_data:
        logger.info("Nothing to vectorize.")
        return

    chunks, metas = [], []
    for item in parsed_data:
        item_chunks, item_metas = _chunks_for_syllabus(item)
        chunks.extend(item_chunks)
        metas.extend(item_metas)

    logger.info(f"Embedding {len(chunks)} chunks ...")
    vecs = generate_embeddings(chunks)
    VECTOR_DB = [{"vector": np.array(v), "meta": meta} for v, meta in zip(vecs, metas)]
    # Reset the matrix when there is nothing to stack, so it never refers to
    # vectors that are no longer in VECTOR_DB.
    VECTOR_MATRIX = np.vstack([e["vector"] for e in VECTOR_DB]) if VECTOR_DB else None
    save_all_vectors()
    logger.info(f"Index done: {len(SYLLABUS_MAP)} syllabi, {len(VECTOR_DB)} vectors.")
def _incremental_vectorize(syllabus_data):
    """Embed one parsed syllabus and merge it into the vector index.

    Vectors already indexed for the same subject id are replaced rather
    than duplicated, so re-indexing an updated syllabus does not return
    stale or doubled search hits. Nothing changes when the syllabus has no
    subtopic with at least 10 characters of content.

    Args:
        syllabus_data: dict with ``meta`` and ``tree`` as returned by
            ``PDFParser.parse()``.
    """
    global VECTOR_DB, VECTOR_MATRIX
    mb = syllabus_data["meta"]
    chunks, metas = [], []
    for topic in syllabus_data["tree"]:
        # .get(): a topic created without a title must not abort indexing.
        topic_title = topic.get("title", "")
        for sub in topic.get("children", []):
            blob = "\n".join(sub.get("content", []))
            if len(blob) < 10:
                continue
            chunks.append(f"{mb['subject']} {mb['level']} - {topic_title} - {sub['title']}:\n{blob}")
            metas.append({
                "subject_id": mb["id"],
                "topic_id": topic["id"],
                "subtopic_id": sub["id"],
                "title": sub["title"],
                "content": blob,
            })
    if not chunks:
        return

    new_entries = [
        {"vector": np.array(v), "meta": meta}
        for v, meta in zip(generate_embeddings(chunks), metas)
    ]
    kept = [e for e in VECTOR_DB if e["meta"].get("subject_id") != mb["id"]]
    # Rebind instead of mutating in place so readers holding the old list
    # keep a consistent view.
    VECTOR_DB = kept + new_entries
    if VECTOR_DB:
        VECTOR_MATRIX = np.vstack([e["vector"] for e in VECTOR_DB])
    save_all_vectors()
# ---------------------------------------------------------------------------
# WATCHER
# ---------------------------------------------------------------------------
_indexed_files = set()


def _collect_existing():
    """Record every PDF currently on disk in ``_indexed_files``.

    Both the syllabus and the past-exam directory are walked recursively;
    a directory that does not exist is skipped.
    """
    for base in (SYLLABI_DIR, PAST_EXAMS_DIR):
        if os.path.exists(base):
            for folder, _subdirs, names in os.walk(base):
                _indexed_files.update(
                    os.path.join(folder, name)
                    for name in names
                    if name.endswith(".pdf")
                )
def _watch(interval=30):
    """Poll the data directories forever and index PDFs not seen before.

    Every ``interval`` seconds both directories are walked. A new file is
    added to ``_indexed_files`` before it is parsed, so a file that fails
    to parse is logged once and not retried. New syllabi are stored,
    persisted and vectorised; new exam papers are stored and persisted.
    The PDF handle is closed after each file so a long-running watcher
    does not accumulate open documents.

    Args:
        interval: seconds to sleep between scans (default 30).
    """
    while True:
        time.sleep(interval)
        for directory, is_exam in [(SYLLABI_DIR, False), (PAST_EXAMS_DIR, True)]:
            if not os.path.exists(directory):
                continue
            for root, _, files in os.walk(directory):
                for f in files:
                    if not f.endswith(".pdf"):
                        continue
                    path = os.path.join(root, f)
                    if path in _indexed_files:
                        continue
                    _indexed_files.add(path)
                    logger.info(f"New PDF: {path}")
                    parser = None
                    try:
                        if is_exam:
                            parser = ExamPaperParser(path)
                            exam_data = parser.parse()
                            sid = exam_data["meta"]["subjectId"]
                            safe = re.sub(r'[.\[\]#$/]', '_', exam_data["meta"]["paperId"])
                            EXAM_MAP.setdefault(sid, {})[safe] = exam_data
                            save_exam(sid, exam_data)
                        else:
                            parser = PDFParser(path)
                            data = parser.parse()
                            SYLLABUS_MAP[data["meta"]["id"]] = data
                            save_syllabus(data["meta"]["id"], data)
                            _incremental_vectorize(data)
                    except Exception as e:
                        logger.error(f"Watch {path}: {e}")
                    finally:
                        # Release the PDF handle; a close error must not
                        # kill the watcher thread.
                        if parser is not None:
                            try:
                                parser.doc.close()
                            except Exception as e:
                                logger.warning(f"Watch close {path}: {e}")
# ---------------------------------------------------------------------------
# API
# ---------------------------------------------------------------------------
@app.route('/health', methods=['GET'])
def health():
    """Report service status and a summary of what is currently indexed."""
    report = {
        "status": "online",
        "subjects_loaded": list(SYLLABUS_MAP.keys()),
        "subject_count": len(SYLLABUS_MAP),
        "vector_chunks": len(VECTOR_DB),
        "exam_subjects": list(EXAM_MAP.keys()),
        "firebase": FIREBASE_AVAILABLE,
        "registered_subjects": ALL_SUBJECTS,
    }
    return jsonify(report)
@app.route('/v1/subjects', methods=['GET'])
def list_subjects():
    """List indexed subjects, then registered subjects not yet indexed.

    Indexed entries carry their stored metadata plus ``"indexed": True``;
    the others are built from the registry with ``"indexed": False``.
    """
    indexed = [
        {**entry.get("meta", {"id": sid}), "indexed": True}
        for sid, entry in SYLLABUS_MAP.items()
    ]
    pending = [
        {
            "id": uid,
            "subject": name,
            "code": uid.split("_")[1],
            "level": "A" if uid.startswith("A_") else "O",
            "indexed": False,
        }
        for uid, name in ALL_SUBJECTS.items()
        if uid not in SYLLABUS_MAP
    ]
    return jsonify(indexed + pending)
 @app.route('/v1/structure/<subject_id>', methods=['GET'])
 def get_structure(subject_id):
     """Return the parsed syllabus (meta + tree) of a subject, or a 404 error."""
     syllabus = SYLLABUS_MAP.get(subject_id)
     if syllabus:
         return jsonify(syllabus)
     return jsonify({"error": "Subject not found"}), 404
 @app.route('/v1/search', methods=['POST'])
 def search():
     """Semantic search over the indexed syllabus chunks.

     Expects a JSON body ``{"query": str, "filter_subject_id": str (optional)}``
     and returns up to 5 hits with cosine similarity >= 0.3, best first.

     Responses:
         200 ``{"results": [...]}``; 400 when the query is missing (including
         an absent or non-JSON body); 503 when the index or the embedding
         API is unavailable; 500 when embedding the query fails.
     """
     # Snapshot the index: a rebuild thread may rebind these globals while
     # the request is being served.
     matrix, vector_db = VECTOR_MATRIX, VECTOR_DB
     if matrix is None or not vector_db:
         return jsonify({"error": "Index not ready"}), 503
     # silent=True: a missing or malformed JSON body yields None instead of
     # an error response, so the client gets the intended 400 below.
     req = request.get_json(silent=True)
     if not isinstance(req, dict):
         req = {}
     q = req.get("query")
     sf = req.get("filter_subject_id")
     if not q:
         return jsonify({"error": "Query required"}), 400
     c = get_gemini()
     if c is None:
         return jsonify({"error": "Embedding API not configured"}), 503
     try:
         resp = c.models.embed_content(model=EMBEDDING_MODEL, contents=q)
         qv = np.array(resp.embeddings[0].values).reshape(1, -1)
     except Exception as e:
         return jsonify({"error": str(e)}), 500
     scores = cosine_similarity(qv, matrix)[0]
     results = []
     for idx in np.argsort(scores)[::-1]:
         # Scores are visited in descending order, so the first one below
         # the threshold ends the scan.
         if scores[idx] < 0.3:
             break
         meta = vector_db[idx]["meta"]
         if sf and meta["subject_id"] != sf:
             continue
         results.append({
             "score": float(scores[idx]),
             "subject_id": meta["subject_id"],
             "title": meta["title"],
             "content": meta["content"],
             "node_id": meta["subtopic_id"],
         })
         if len(results) >= 5:
             break
     return jsonify({"results": results})
 @app.route('/v1/exams', methods=['GET'])
 def list_exams():
     """Return the metadata of indexed past papers as a JSON list.

     The optional ``subject_id`` query parameter limits the listing to one
     subject.
     """
     wanted = request.args.get("subject_id")
     metas = [
         record["meta"]
         for subject, papers in EXAM_MAP.items()
         if not wanted or subject == wanted
         for record in papers.values()
         if isinstance(record, dict) and "meta" in record
     ]
     return jsonify(metas)
 @app.route('/v1/exams/<paper_id>', methods=['GET'])
 def get_exam(paper_id):
     """Return one full past-paper record as JSON, or a 404 JSON error.

     A record matches by its storage key (``paper_id`` with ``. [ ] # $ /``
     replaced by underscores) or by its ``meta.paperId``.
     """
     storage_key = re.sub(r'[.\[\]#$/]', '_', paper_id)
     for subject_papers in EXAM_MAP.values():
         for stored_key, record in subject_papers.items():
             if stored_key == storage_key:
                 return jsonify(record)
             if isinstance(record, dict) and record.get("meta", {}).get("paperId") == paper_id:
                 return jsonify(record)
     return jsonify({"error": "Not found"}), 404
 @app.route('/v1/exams/<paper_id>/questions', methods=['GET'])
 def get_exam_questions(paper_id):
     """Return only the extracted questions (plus metadata) of a past paper.

     The lookup matches the sanitised storage key or ``meta.paperId``; a
     404 JSON error is returned when nothing matches.
     """
     storage_key = re.sub(r'[.\[\]#$/]', '_', paper_id)
     for subject_papers in EXAM_MAP.values():
         for stored_key, record in subject_papers.items():
             matched = stored_key == storage_key or (
                 isinstance(record, dict)
                 and record.get("meta", {}).get("paperId") == paper_id
             )
             if not matched:
                 continue
             payload = {
                 "paperId": paper_id,
                 "meta": record.get("meta"),
                 "questions": record.get("questions", []),
             }
             return jsonify(payload)
     return jsonify({"error": "Not found"}), 404
 @app.route('/v1/rebuild', methods=['POST'])
 def trigger_rebuild():
     """Start a full index rebuild in a background thread (admin use).

     When the ``REBUILD_SECRET`` environment variable is set, the request
     must carry ``Authorization: Bearer <secret>``; otherwise 401 is
     returned. When it is unset, the endpoint is not protected.

     Returns 202 as soon as the rebuild thread has been started.
     """
     import hmac  # local import: only needed for the token comparison

     secret = os.environ.get("REBUILD_SECRET", "")
     if secret:
         supplied = request.headers.get("Authorization", "")
         expected = f"Bearer {secret}"
         # Constant-time comparison, so response timing does not reveal how
         # much of the token was guessed correctly. Bytes are compared
         # because compare_digest rejects non-ASCII str arguments.
         if not hmac.compare_digest(supplied.encode("utf-8"), expected.encode("utf-8")):
             return jsonify({"error": "Unauthorized"}), 401

     def _bg():
         # Drop the in-memory index first so build_index() starts from scratch.
         global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX, EXAM_MAP
         SYLLABUS_MAP = {}
         VECTOR_DB = []
         VECTOR_MATRIX = None
         EXAM_MAP = {}
         build_index()

     threading.Thread(target=_bg, daemon=True).start()
     return jsonify({"status": "rebuild started"}), 202
# ---------------------------------------------------------------------------
# STARTUP
# ---------------------------------------------------------------------------
def start_app():
    """Prepare data directories, load or build the index, start the watcher."""
    # Missing base directories are created with their "A" and "O" level folders.
    for base_dir in (SYLLABI_DIR, PAST_EXAMS_DIR):
        if os.path.exists(base_dir):
            continue
        for level in ("A", "O"):
            os.makedirs(os.path.join(base_dir, level), exist_ok=True)

    # Prefer the Firebase cache; fall back to a full build from the PDFs.
    if load_index_from_firebase():
        logger.info("Served from Firebase cache.")
    else:
        build_index()

    # Files already on disk are recorded so the watcher skips them.
    _collect_existing()
    watcher = threading.Thread(target=_watch, daemon=True)
    watcher.start()
    logger.info("Watcher started.")
 # Run the startup bootstrap at import time, inside a Flask application context.
 with app.app_context():
     start_app()