import json
import logging
import os
import re
import time

import fitz  # PyMuPDF
import numpy as np
from flask import Flask, jsonify, request
from flask_cors import CORS
from google import genai
from google.genai import types
from sklearn.metrics.pairwise import cosine_similarity

# --- CONFIGURATION ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Root folder that is walked for syllabus PDFs; the parent folder of each PDF
# is used as its level (e.g., ./syllabi/A/Physics.pdf -> level "A").
SYLLABI_DIR = "syllabi"
# Name reserved for an on-disk cache of the index.
# NOTE(review): nothing in this file appears to read or write it yet - confirm.
INDEX_FILE = "syllabus_index.json"

# Google GenAI settings. The key comes from the environment and may be None,
# in which case embedding generation falls back to dummy vectors.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
EMBEDDING_MODEL = "models/text-embedding-004"

# --- GLOBAL STATE (IN-MEMORY) ---
# Parsed syllabi keyed by unique id:
#   { "A_9706": { "meta": {...}, "tree": [...] }, ... }
SYLLABUS_MAP = {}
# One entry per embedded chunk: [ { "vector": ndarray, "meta": {...} }, ... ]
VECTOR_DB = []
# All chunk vectors stacked row-wise (same order as VECTOR_DB); None until the
# index has been built.
VECTOR_MATRIX = None

app = Flask(__name__)
CORS(app)

# -----------------------------------------------------------------------------
# 1.
# THE PARSER ENGINE (Extracts Structure from PDF)
# -----------------------------------------------------------------------------
class PDFParser:
    """Turn a syllabus PDF into a topic / subtopic / content tree.

    Structure is inferred purely from font size and weight relative to the
    document's dominant ("body") font size; see :meth:`parse`.

    The underlying PyMuPDF document stays open for the lifetime of the parser.
    Call :meth:`close` (or use the parser as a context manager) to release it.

    Attributes:
        filepath: Path the parser was created with.
        filename: Base name of ``filepath``.
        doc: The opened PyMuPDF document.
        level: Name of the PDF's parent directory, or ``"General"``.
        subject_code: First 4-digit run in the filename, or ``"0000"``.
        subject_name: Filename stem up to the first underscore.
        unique_id: ``"<level>_<subject_code>"``.
    """

    # Start-of-block markers that promote body-sized text to a subtopic:
    # "1 ", "1. ", "1.1 ", "2.3.4 " or "Key Question " (case-insensitive).
    TOPIC_PATTERN = re.compile(
        r'^(\d+(?:\.\d+)*\.?\s|Key Question\s)', re.IGNORECASE
    )

    def __init__(self, filepath):
        """Open ``filepath`` and derive subject metadata from its path.

        Expected layout: ``syllabi/A/Accounting_9706.pdf``.
        """
        self.filepath = filepath
        self.filename = os.path.basename(filepath)
        self.doc = fitz.open(filepath)
        (
            self.level,
            self.subject_code,
            self.subject_name,
            self.unique_id,
        ) = self._identify(filepath)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False

    def close(self):
        """Close the underlying document; safe to call more than once."""
        doc = getattr(self, "doc", None)
        if doc is not None:
            doc.close()
            self.doc = None

    @staticmethod
    def _identify(filepath):
        """Return ``(level, subject_code, subject_name, unique_id)`` for a path."""
        # Normalise first so "./x.pdf" or mixed separators do not yield a
        # bogus level such as ".".
        parts = os.path.normpath(filepath).split(os.sep)
        level = parts[-2] if len(parts) > 1 and parts[-2] else "General"

        filename = os.path.basename(filepath)
        code_match = re.search(r'\d{4}', filename)
        subject_code = code_match.group(0) if code_match else "0000"

        # Drop the extension before splitting so "Physics.pdf" gives "Physics"
        # rather than "Physics.pdf".
        subject_name = os.path.splitext(filename)[0].split('_')[0]
        return level, subject_code, subject_name, f"{level}_{subject_code}"

    @staticmethod
    def _summarise_block(block):
        """Return ``(text, max_size, is_bold)`` for one ``get_text("dict")`` block.

        ``text`` is the block's non-empty span texts joined by single spaces.
        ``max_size`` is the largest span size rounded to 0.1pt, the same
        precision used for the body size, so the two can be compared reliably.
        ``is_bold`` is True if any span's font name contains "bold".
        Blocks without a "lines" key yield ``("", 0, False)``.
        """
        fragments = []
        max_size = 0
        is_bold = False
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                text = span["text"].strip()
                if not text:
                    continue
                fragments.append(text)
                max_size = max(max_size, round(span["size"], 1))
                if "bold" in span["font"].lower():
                    is_bold = True
        return " ".join(fragments), max_size, is_bold

    def get_font_characteristics(self):
        """Return the font size (rounded to 0.1pt) covering the most characters.

        That size is taken to be the body text. Returns ``10.0`` when the
        document contains no text spans.
        """
        font_sizes = {}
        for page in self.doc:
            for block in page.get_text("dict")["blocks"]:
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        size = round(span["size"], 1)
                        font_sizes[size] = font_sizes.get(size, 0) + len(span["text"])

        if not font_sizes:
            return 10.0
        return max(font_sizes, key=font_sizes.get)

    def parse(self):
        """Build the syllabus tree using font-based heuristics.

        - Text more than 2pt larger than body          -> topic
        - Bold text, or text starting with a numbered /
          "Key Question" marker, at least body-sized   -> subtopic
        - Text at most 1pt larger than body            -> content

        Blocks shorter than 3 characters are skipped as noise. Non-bold,
        unnumbered blocks between 1pt and 2pt larger than body match none of
        the rules and are ignored, as is content seen before any topic.

        Returns:
            ``{"meta": {"id", "subject", "code", "level"}, "tree": [topic, ...]}``
            where each topic has ``children`` (subtopics) and each subtopic
            has ``content`` (list of text blocks).
        """
        body_size = self.get_font_characteristics()
        logger.info(f"Parsing {self.filename} (Body size approx {body_size}pt)")

        syllabus_tree = []
        current_topic = None
        current_subtopic = None

        for page in self.doc:
            for block in page.get_text("dict")["blocks"]:
                block_text, max_size, is_bold = self._summarise_block(block)
                if len(block_text) < 3:
                    continue  # Skip noise

                # HEURISTIC 1: TOPIC (large header, 2pt+ above body)
                if max_size > body_size + 2:
                    # Close whatever was being collected before starting anew.
                    if current_subtopic and current_topic:
                        current_topic["children"].append(current_subtopic)
                    if current_topic:
                        syllabus_tree.append(current_topic)
                    current_topic = {
                        "id": f"{self.unique_id}_{len(syllabus_tree)}",
                        "title": block_text,
                        "type": "topic",
                        "children": [],
                    }
                    current_subtopic = None

                # HEURISTIC 2: SUBTOPIC (bold or numbered, at least body size)
                elif max_size >= body_size and (
                    is_bold or self.TOPIC_PATTERN.match(block_text)
                ):
                    if current_subtopic and current_topic:
                        current_topic["children"].append(current_subtopic)
                    # A subtopic seen before any topic hangs off a dummy root.
                    if not current_topic:
                        current_topic = {
                            "id": f"{self.unique_id}_root",
                            "title": "Syllabus Overview",
                            "type": "topic",
                            "children": [],
                        }
                    current_subtopic = {
                        "id": f"{current_topic['id']}_{len(current_topic['children'])}",
                        "title": block_text,
                        "type": "subtopic",
                        "content": [],
                    }

                # HEURISTIC 3: CONTENT (body text)
                elif max_size <= body_size + 1:
                    if current_subtopic:
                        current_subtopic["content"].append(block_text)
                    elif current_topic:
                        # Text directly under a topic: open an implicit subtopic.
                        current_subtopic = {
                            "id": f"{current_topic['id']}_intro",
                            "title": "Introduction / Overview",
                            "type": "subtopic",
                            "content": [block_text],
                        }

        # Flush remainders
        if current_subtopic and current_topic:
            current_topic["children"].append(current_subtopic)
        if current_topic:
            syllabus_tree.append(current_topic)

        return {
            "meta": {
                "id": self.unique_id,
                "subject": self.subject_name,
                "code": self.subject_code,
                "level": self.level,
            },
            "tree": syllabus_tree,
        }

# -----------------------------------------------------------------------------
# 2. THE VECTOR ENGINE (Embeddings & Search)
# -----------------------------------------------------------------------------
# Length of the placeholder vectors used when an embedding cannot be obtained.
# NOTE(review): must equal the output size of EMBEDDING_MODEL, otherwise
# stacking real and placeholder vectors fails - confirm if the model changes.
EMBEDDING_DIM = 768
# Number of texts sent per embed_content call.
EMBEDDING_BATCH_SIZE = 10
# Search tuning: minimum cosine similarity and maximum number of hits returned.
SEARCH_MIN_SCORE = 0.3
SEARCH_MAX_RESULTS = 5


def generate_embeddings(texts):
    """Embed ``texts`` with the Gemini API, one vector per input text.

    The result always has the same length and order as ``texts``. When no API
    key is configured, or when a batch fails, the affected texts get all-zero
    placeholder vectors instead of raising.

    Args:
        texts: List of strings to embed.

    Returns:
        List of 1-D numpy arrays.
    """
    if not GEMINI_API_KEY:
        logger.warning("No Gemini API Key found. Skipping embeddings.")
        return [np.zeros(EMBEDDING_DIM) for _ in texts]  # Dummy vectors

    client = genai.Client(api_key=GEMINI_API_KEY)
    results = []

    # Simple batching to avoid hitting limits
    for start in range(0, len(texts), EMBEDDING_BATCH_SIZE):
        batch = texts[start:start + EMBEDDING_BATCH_SIZE]
        try:
            resp = client.models.embed_content(
                model=EMBEDDING_MODEL,
                contents=batch,
            )
            vectors = [np.array(embedding.values) for embedding in resp.embeddings]
            # Collect the whole batch before committing so a partial response
            # can never shift vectors against their texts.
            if len(vectors) != len(batch):
                raise ValueError(
                    f"expected {len(batch)} embeddings, got {len(vectors)}"
                )
        except Exception as e:
            logger.error(f"Embedding failed: {e}")
            # Fallback for failed batch
            vectors = [np.zeros(EMBEDDING_DIM) for _ in batch]
        results.extend(vectors)

    return results


def build_index():
    """Walk SYLLABI_DIR, parse every PDF and rebuild the in-memory index.

    Replaces the module globals SYLLABUS_MAP, VECTOR_DB and VECTOR_MATRIX.
    They are only swapped in once the new data is complete, and
    VECTOR_MATRIX is reset to None when there is nothing to index so it can
    never disagree with VECTOR_DB. A PDF that cannot be opened or parsed is
    logged and skipped rather than aborting the whole build.
    """
    global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX

    logger.info("🚀 Starting Build Process...")

    # 1. Walk Directory
    if not os.path.exists(SYLLABI_DIR):
        logger.error(f"Directory {SYLLABI_DIR} not found.")
        return

    parsed_data = []
    syllabus_map = {}
    for root, _dirs, files in os.walk(SYLLABI_DIR):
        for file in files:
            if not file.lower().endswith(".pdf"):
                continue
            path = os.path.join(root, file)
            try:
                parser = PDFParser(path)
            except Exception:
                logger.exception("Could not open %s; skipping.", path)
                continue
            try:
                data = parser.parse()
            except Exception:
                logger.exception("Could not parse %s; skipping.", path)
                continue
            finally:
                # The parser keeps the PyMuPDF document open; release it.
                parser.doc.close()

            subject_id = data["meta"]["id"]
            if subject_id in syllabus_map:
                logger.warning(
                    "Duplicate subject id %s (from %s); replacing earlier entry.",
                    subject_id, path,
                )
            parsed_data.append(data)
            syllabus_map[subject_id] = data

    # 2. Flatten for Vectorization
    chunks_to_embed = []
    chunk_metadata = []
    for item in parsed_data:
        meta_base = item["meta"]
        for topic in item["tree"]:
            for sub in topic["children"]:
                text_blob = "\n".join(sub["content"])
                if len(text_blob) < 10:
                    continue  # Skip empty chunks

                # Rich semantic chunk: "Subject Level - Topic - Subtopic: Content"
                rich_text = (
                    f"{meta_base['subject']} {meta_base['level']} - "
                    f"{topic['title']} - {sub['title']}:\n{text_blob}"
                )
                chunks_to_embed.append(rich_text)
                chunk_metadata.append({
                    "subject_id": meta_base["id"],
                    "topic_id": topic["id"],
                    "subtopic_id": sub["id"],
                    "title": sub["title"],
                    "content": text_blob,
                })

    # 3. Generate Embeddings
    logger.info(f"🧮 Generating embeddings for {len(chunks_to_embed)} chunks...")
    vectors = generate_embeddings(chunks_to_embed)

    # 4. Populate Global DB
    vector_db = [
        {"vector": vec, "meta": meta}  # vector kept for debug/individual access
        for vec, meta in zip(vectors, chunk_metadata)
    ]
    vector_matrix = np.vstack([entry["vector"] for entry in vector_db]) if vector_db else None

    SYLLABUS_MAP = syllabus_map
    VECTOR_DB = vector_db
    VECTOR_MATRIX = vector_matrix

    logger.info("✅ Indexing Complete.")


# -----------------------------------------------------------------------------
# 3. API SERVER (The Retrieval Layer)
# -----------------------------------------------------------------------------
@app.route('/health', methods=['GET'])
def health():
    """Report liveness and the ids of all loaded subjects."""
    return jsonify({"status": "online", "subjects_loaded": list(SYLLABUS_MAP.keys())})


# The <subject_id> placeholder is required: without it Flask calls the view
# with no arguments and the request fails with a TypeError.
@app.route('/v1/structure/<subject_id>', methods=['GET'])
def get_structure(subject_id):
    """Returns the static JSON tree for navigation UI (404 if unknown)."""
    data = SYLLABUS_MAP.get(subject_id)
    if not data:
        return jsonify({"error": "Subject not found"}), 404
    return jsonify(data)


@app.route('/v1/search', methods=['POST'])
def search():
    """
    Semantic Retrieval.
    Input: { "query": "...", "filter_subject_id": "..." (optional) }

    Responses:
        200 {"results": [...]}  up to SEARCH_MAX_RESULTS hits, best first,
                                each scoring at least SEARCH_MIN_SCORE
        400 query missing, or body is not a JSON object
        500 embedding the query failed
        503 index has not been built
    """
    # Snapshot the globals so a concurrent rebuild cannot swap them mid-request.
    matrix, vector_db = VECTOR_MATRIX, VECTOR_DB
    if matrix is None:
        return jsonify({"error": "Index not ready"}), 503

    # silent=True: a missing or malformed body becomes a 400 below instead of
    # an AttributeError on None.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}
    query = data.get("query")
    subject_filter = data.get("filter_subject_id")

    if not query:
        return jsonify({"error": "Query required"}), 400

    # 1. Embed Query
    try:
        client = genai.Client(api_key=GEMINI_API_KEY)
        resp = client.models.embed_content(model=EMBEDDING_MODEL, contents=query)
        query_vec = np.array(resp.embeddings[0].values).reshape(1, -1)
    except Exception as e:
        logger.exception("Query embedding failed")
        return jsonify({"error": str(e)}), 500

    if query_vec.shape[1] != matrix.shape[1]:
        return jsonify({"error": "Query embedding size does not match index"}), 500

    # 2. Vector Search (Cosine Similarity); scores has one entry per chunk.
    scores = cosine_similarity(query_vec, matrix)[0]

    # 3. Filter and Sort: walk every chunk from best to worst score.
    results = []
    for idx in np.argsort(scores)[::-1]:
        if scores[idx] < SEARCH_MIN_SCORE:
            break  # Everything after this scores lower still

        meta = vector_db[idx]["meta"]

        # Apply Filter
        if subject_filter and meta["subject_id"] != subject_filter:
            continue

        results.append({
            "score": float(scores[idx]),
            "subject_id": meta["subject_id"],
            "title": meta["title"],
            "content": meta["content"],  # Raw text chunk
            "node_id": meta["subtopic_id"],  # Pointer to the structure tree
        })
        if len(results) >= SEARCH_MAX_RESULTS:
            break

    return jsonify({"results": results})


# -----------------------------------------------------------------------------
# 4. STARTUP BOOTSTRAP
# -----------------------------------------------------------------------------
def start_app():
    """Ensure the syllabus folders exist, then build the index.

    In a real deployment the index might be loaded from disk here; for now it
    is rebuilt on every boot.
    """
    if not os.path.exists(SYLLABI_DIR):
        os.makedirs(os.path.join(SYLLABI_DIR, "A"), exist_ok=True)
        os.makedirs(os.path.join(SYLLABI_DIR, "O"), exist_ok=True)
        logger.warning(f"Created empty {SYLLABI_DIR}. Please add PDFs.")

    # Run Indexer
    build_index()


# Run the builder once on import (or server start)
with app.app_context():
    start_app()

if __name__ == '__main__':
    # Use 7860 for HF Spaces
    app.run(host='0.0.0.0', port=7860)