Spaces:
Sleeping
Sleeping
| import os | |
| import json | |
| import logging | |
| import re | |
| import time | |
| import numpy as np | |
| import fitz # PyMuPDF | |
| from flask import Flask, request, jsonify | |
| from flask_cors import CORS | |
| from google import genai | |
| from google.genai import types | |
| from sklearn.metrics.pairwise import cosine_similarity | |
# --- CONFIGURATION ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Root folder that is walked for syllabus PDFs (e.g., ./syllabi/A/Physics.pdf).
SYLLABI_DIR = "syllabi"
# Name reserved for a local cache file; nothing in the code shown here reads
# or writes it yet.
INDEX_FILE = "syllabus_index.json"

# Google GenAI settings. The key comes from the environment and is None when
# the variable is unset.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
EMBEDDING_MODEL = "models/text-embedding-004"

# --- GLOBAL STATE (IN-MEMORY) ---
# Parsed syllabi keyed by subject id, as stored by build_index():
#   { "A_9706": { "meta": {...}, "tree": [...] }, ... }
SYLLABUS_MAP = {}
# One entry per embedded chunk, as stored by build_index():
#   [ { "vector": <np.ndarray>, "meta": {...} }, ... ]
VECTOR_DB = []
# All chunk vectors stacked row-wise for fast similarity math; None until an
# index has been built.
VECTOR_MATRIX = None

app = Flask(__name__)
CORS(app)
| # ----------------------------------------------------------------------------- | |
| # 1. THE PARSER ENGINE (Extracts Structure from PDF) | |
| # ----------------------------------------------------------------------------- | |
class PDFParser:
    """Turn one syllabus PDF into a topic / subtopic / content tree.

    The structure is inferred purely from typography: the dominant font size
    is treated as body text, clearly larger text opens a topic, and bold or
    numbered text at (or above) body size opens a subtopic.

    The underlying PyMuPDF document is opened in ``__init__``; call
    :meth:`close` (or use the parser as a context manager) to release it.
    """

    # Headings such as "1 Title", "1. Title", "1.1 Title", "2.3.4 Title" or
    # "Key Question ...". Compiled once for all instances.
    _HEADING_PATTERN = re.compile(
        r'^(\d+(?:\.\d+)*\.?\s|Key Question\s)', re.IGNORECASE
    )

    def __init__(self, filepath):
        """Open *filepath* with PyMuPDF and derive subject metadata from it."""
        self.filepath = filepath
        self.filename = os.path.basename(filepath)
        self.doc = fitz.open(filepath)
        self._derive_metadata()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False  # never swallow exceptions raised inside the with-block

    def close(self):
        """Close the underlying document; safe to call more than once."""
        doc = getattr(self, "doc", None)
        if doc is None or getattr(doc, "is_closed", False):
            return
        doc.close()

    def _derive_metadata(self):
        """Fill level, subject code/name and unique id from the file path.

        Expected layout: ``syllabi/A/Accounting_9706.pdf``.
        """
        # Normalise first so "./x.pdf" or mixed separators do not yield a
        # bogus level such as ".".
        parts = os.path.normpath(self.filepath).split(os.sep)
        self.level = parts[-2] if len(parts) > 1 else "General"

        # First run of four digits in the filename is the code (e.g. 9618).
        code_match = re.search(r'\d{4}', self.filename)
        self.subject_code = code_match.group(0) if code_match else "0000"

        # Drop the extension before splitting so "Physics.pdf" gives
        # "Physics" rather than "Physics.pdf".
        stem = os.path.splitext(self.filename)[0]
        self.subject_name = stem.split('_')[0]
        self.unique_id = f"{self.level}_{self.subject_code}"

    def get_font_characteristics(self):
        """Return the font size (rounded to 0.1pt) that carries the most text.

        Falls back to 10.0 when the document contains no text spans.
        """
        font_sizes = {}
        for page in self.doc:
            for block in page.get_text("dict")["blocks"]:
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        size = round(span["size"], 1)
                        font_sizes[size] = font_sizes.get(size, 0) + len(span["text"])
        # The size with the most characters is taken to be the body text.
        if not font_sizes:
            return 10.0
        return max(font_sizes, key=font_sizes.get)

    @staticmethod
    def _summarise_block(block):
        """Return ``(text, max_size, is_bold)`` for one PyMuPDF text block.

        Sizes are rounded to 0.1pt, the same quantisation used by
        get_font_characteristics(), so the comparisons in parse() are
        like-for-like (a 9.96pt span must count as ">= 10.0pt body").
        """
        pieces = []
        max_size = 0
        is_bold = False
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                text = span["text"].strip()
                if not text:
                    continue
                pieces.append(text)
                max_size = max(max_size, round(span["size"], 1))
                if "bold" in span["font"].lower():
                    is_bold = True
        return " ".join(pieces), max_size, is_bold

    def parse(self):
        """Build the syllabus tree for this document.

        Heuristics:
        - Text more than 2pt larger than body = Topic
        - Bold or numbered text at/above body size = Subtopic
        - Text up to 1pt above body size = Content/Objectives

        Returns:
            dict with ``"meta"`` (id, subject, code, level) and ``"tree"``
            (list of topic dicts, each with a ``"children"`` list of
            subtopic dicts holding a ``"content"`` list of strings).
        """
        body_size = self.get_font_characteristics()
        logger.info("Parsing %s (Body size approx %spt)", self.filename, body_size)

        syllabus_tree = []
        current_topic = None
        current_subtopic = None

        for page in self.doc:
            for block in page.get_text("dict")["blocks"]:
                block_text, max_size, is_bold = self._summarise_block(block)
                if len(block_text) < 3:
                    continue  # Skip noise (also skips blocks with no text)

                # HEURISTIC 1: TOPIC (large header, more than 2pt above body)
                if max_size > body_size + 2:
                    # Close whatever was open before starting the new topic.
                    if current_subtopic and current_topic:
                        current_topic["children"].append(current_subtopic)
                    if current_topic:
                        syllabus_tree.append(current_topic)
                    current_topic = {
                        "id": f"{self.unique_id}_{len(syllabus_tree)}",
                        "title": block_text,
                        "type": "topic",
                        "children": []
                    }
                    current_subtopic = None

                # HEURISTIC 2: SUBTOPIC (bold or numbered, at/above body size)
                elif max_size >= body_size and (
                    is_bold or self._HEADING_PATTERN.match(block_text)
                ):
                    if current_subtopic and current_topic:
                        current_topic["children"].append(current_subtopic)
                    # A subtopic seen before any topic hangs off a dummy root.
                    if not current_topic:
                        current_topic = {
                            "id": f"{self.unique_id}_root",
                            "title": "Syllabus Overview",
                            "type": "topic",
                            "children": []
                        }
                    current_subtopic = {
                        "id": f"{current_topic['id']}_{len(current_topic['children'])}",
                        "title": block_text,
                        "type": "subtopic",
                        "content": []
                    }

                # HEURISTIC 3: CONTENT (body text)
                # NOTE(review): plain text sized between body+1 and body+2
                # matches no branch and is dropped; confirm that is intended.
                elif max_size <= body_size + 1:
                    if current_subtopic:
                        current_subtopic["content"].append(block_text)
                    elif current_topic:
                        # Text directly under a topic gets an implicit subtopic.
                        current_subtopic = {
                            "id": f"{current_topic['id']}_intro",
                            "title": "Introduction / Overview",
                            "type": "subtopic",
                            "content": [block_text]
                        }

        # Flush remainders
        if current_subtopic and current_topic:
            current_topic["children"].append(current_subtopic)
        if current_topic:
            syllabus_tree.append(current_topic)

        return {
            "meta": {
                "id": self.unique_id,
                "subject": self.subject_name,
                "code": self.subject_code,
                "level": self.level
            },
            "tree": syllabus_tree
        }
| # ----------------------------------------------------------------------------- | |
| # 2. THE VECTOR ENGINE (Embeddings & Search) | |
| # ----------------------------------------------------------------------------- | |
def generate_embeddings(texts):
    """Embed *texts* with the Gemini API, one vector per input, in order.

    Args:
        texts: list of strings to embed.

    Returns:
        A list of numpy arrays with exactly ``len(texts)`` entries. When no
        API key is configured, or when a batch fails, the affected entries
        are 768-dimensional zero vectors so positions stay aligned with
        *texts*.
    """
    if not texts:
        return []

    if not GEMINI_API_KEY:
        logger.warning("No Gemini API Key found. Skipping embeddings.")
        return [np.zeros(768) for _ in texts]  # Dummy vectors

    client = genai.Client(api_key=GEMINI_API_KEY)
    results = []

    # Simple batching to avoid hitting limits
    batch_size = 10
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        try:
            resp = client.models.embed_content(
                model=EMBEDDING_MODEL,
                contents=batch,
            )
            # Build the batch locally so a failure part-way through cannot
            # leave a partial batch in `results` and shift later vectors.
            batch_vectors = [np.array(embedding.values) for embedding in resp.embeddings]
            if len(batch_vectors) != len(batch):
                raise ValueError(
                    f"expected {len(batch)} embeddings, got {len(batch_vectors)}"
                )
        except Exception:
            logger.exception("Embedding failed for batch starting at index %d", start)
            # Fallback for failed batch
            batch_vectors = [np.zeros(768) for _ in batch]
        results.extend(batch_vectors)

    return results
def build_index():
    """Walk SYLLABI_DIR, parse every PDF and rebuild the in-memory index.

    Replaces the module globals SYLLABUS_MAP, VECTOR_DB and VECTOR_MATRIX
    with freshly built values. Returns None. If SYLLABI_DIR does not exist
    the globals are left untouched.
    """
    global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX
    logger.info("🚀 Starting Build Process...")

    # 1. Walk Directory
    if not os.path.exists(SYLLABI_DIR):
        logger.error(f"Directory {SYLLABI_DIR} not found.")
        return

    parsed_data = []
    syllabus_map = {}
    for root, dirs, files in os.walk(SYLLABI_DIR):
        dirs.sort()  # deterministic traversal order across platforms
        for file in sorted(files):
            if not file.lower().endswith(".pdf"):
                continue
            path = os.path.join(root, file)
            parser = None
            try:
                parser = PDFParser(path)
                data = parser.parse()
            except Exception:
                # One unreadable PDF must not abort the whole build.
                logger.exception("Failed to parse %s; skipping.", path)
                continue
            finally:
                if parser is not None:
                    parser.doc.close()  # release the PyMuPDF handle

            subject_id = data["meta"]["id"]
            if subject_id in syllabus_map:
                logger.warning(
                    "Duplicate subject id %s (from %s); overwriting earlier entry.",
                    subject_id, path,
                )
            parsed_data.append(data)
            syllabus_map[subject_id] = data

    # 2. Flatten for Vectorization
    chunks_to_embed = []
    chunk_metadata = []
    for item in parsed_data:
        meta_base = item["meta"]
        for topic in item["tree"]:
            for sub in topic["children"]:
                # Create a rich semantic chunk
                # Format: "Subject Level - Topic - Subtopic: Content"
                text_blob = "\n".join(sub["content"])
                if len(text_blob) < 10:
                    continue  # Skip empty chunks
                rich_text = (
                    f"{meta_base['subject']} {meta_base['level']} - "
                    f"{topic['title']} - {sub['title']}:\n{text_blob}"
                )
                chunks_to_embed.append(rich_text)
                chunk_metadata.append({
                    "subject_id": meta_base["id"],
                    "topic_id": topic["id"],
                    "subtopic_id": sub["id"],
                    "title": sub["title"],
                    "content": text_blob
                })

    # 3. Generate Embeddings
    logger.info(f"🧮 Generating embeddings for {len(chunks_to_embed)} chunks...")
    vectors = generate_embeddings(chunks_to_embed)
    if len(vectors) != len(chunk_metadata):
        logger.error(
            "Embedding count (%d) does not match chunk count (%d); "
            "indexing only the aligned prefix.",
            len(vectors), len(chunk_metadata),
        )

    # 4. Populate Global DB
    # zip() keeps every DB entry and matrix row on the same index.
    vector_db = [
        {
            "vector": vec,  # Keep for debug/individual access
            "meta": meta,
        }
        for vec, meta in zip(vectors, chunk_metadata)
    ]

    # Publish the new state together; reset the matrix when there is nothing
    # to search so stale rows can never outlive their VECTOR_DB entries.
    SYLLABUS_MAP = syllabus_map
    VECTOR_DB = vector_db
    VECTOR_MATRIX = (
        np.vstack([entry["vector"] for entry in vector_db]) if vector_db else None
    )
    logger.info("✅ Indexing Complete.")
| # ----------------------------------------------------------------------------- | |
| # 3. API SERVER (The Retrieval Layer) | |
| # ----------------------------------------------------------------------------- | |
def health():
    """Report that the service is up and list the ids of the loaded syllabi."""
    # NOTE(review): no route decorator is visible on this view in the source
    # shown here; confirm how it is registered with `app`.
    loaded_ids = list(SYLLABUS_MAP)
    payload = {"status": "online", "subjects_loaded": loaded_ids}
    return jsonify(payload)
def get_structure(subject_id):
    """Return the stored syllabus (meta + tree) for *subject_id* as JSON.

    Responds with a 404 and an error payload when the id is unknown.
    """
    # NOTE(review): no route decorator is visible on this view in the source
    # shown here; confirm how it is registered with `app`.
    entry = SYLLABUS_MAP.get(subject_id)
    if entry:
        return jsonify(entry)
    return jsonify({"error": "Subject not found"}), 404
def search():
    """Semantic retrieval over the embedded syllabus chunks.

    Input (JSON body): { "query": "...", "filter_subject_id": "..." (optional) }

    Returns:
        ``{"results": [...]}`` with at most 5 hits whose cosine similarity is
        at least 0.3, best first. Each hit carries ``score``, ``subject_id``,
        ``title``, ``content`` and ``node_id``.
        Error responses: 503 when no index is loaded, 400 when the query is
        missing or not a non-empty string, 500 when embedding the query fails.
    """
    # NOTE(review): no route decorator is visible on this view in the source
    # shown here; confirm how it is registered with `app`.
    score_threshold = 0.3
    max_results = 5

    # Take one snapshot of the index so a rebuild mid-request cannot mix
    # rows from one build with metadata from another.
    matrix = VECTOR_MATRIX
    vector_db = VECTOR_DB
    if matrix is None:
        return jsonify({"error": "Index not ready"}), 503

    # A missing, malformed or non-object body is a missing query (400),
    # not an unhandled AttributeError.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}
    query = data.get("query")
    subject_filter = data.get("filter_subject_id")
    if not isinstance(query, str) or not query.strip():
        return jsonify({"error": "Query required"}), 400

    # 1. Embed Query
    try:
        # Client construction is inside the try: it can fail too (e.g. no key).
        client = genai.Client(api_key=GEMINI_API_KEY)
        resp = client.models.embed_content(model=EMBEDDING_MODEL, contents=query)
        query_vec = np.array(resp.embeddings[0].values).reshape(1, -1)
    except Exception:
        # Log the details server-side; do not echo provider errors to clients.
        logger.exception("Query embedding failed")
        return jsonify({"error": "Embedding request failed"}), 500

    if query_vec.shape[1] != matrix.shape[1]:
        logger.error(
            "Query embedding has %d dimensions but the index has %d",
            query_vec.shape[1], matrix.shape[1],
        )
        return jsonify({"error": "Embedding dimension mismatch"}), 500

    # 2. Vector Search (Cosine Similarity)
    # scores shape: (N_chunks,)
    scores = cosine_similarity(query_vec, matrix)[0]

    # 3. Filter and Sort
    # Rank every chunk best-first; the loop stops at the first score below
    # the threshold or once enough hits have been collected.
    results = []
    for idx in np.argsort(scores)[::-1]:
        score = float(scores[idx])
        if score < score_threshold:
            break  # Threshold cutoff; all remaining scores are lower
        meta = vector_db[idx]["meta"]

        # Apply Filter
        if subject_filter and meta["subject_id"] != subject_filter:
            continue

        results.append({
            "score": score,
            "subject_id": meta["subject_id"],
            "title": meta["title"],
            "content": meta["content"],  # Raw text chunk
            "node_id": meta["subtopic_id"]  # Pointer to the structure tree
        })
        if len(results) >= max_results:
            break

    return jsonify({"results": results})
| # ----------------------------------------------------------------------------- | |
| # 4. STARTUP BOOTSTRAP | |
| # ----------------------------------------------------------------------------- | |
def start_app():
    """Make sure the syllabus folders exist, then build the index.

    Nothing is loaded from disk here; the index is rebuilt on every boot.
    """
    if not os.path.exists(SYLLABI_DIR):
        # First run: create the per-level folders so PDFs can be dropped in.
        for level_dir in ("A", "O"):
            os.makedirs(os.path.join(SYLLABI_DIR, level_dir), exist_ok=True)
        logger.warning(f"Created empty {SYLLABI_DIR}. Please add PDFs.")

    # Run Indexer
    build_index()
# Build the index once at import time, so it also runs when a WSGI server
# imports this module instead of executing it as a script.
with app.app_context():
    start_app()

if __name__ == "__main__":
    # Use 7860 for HF Spaces
    app.run(host="0.0.0.0", port=7860)