Spaces:

rairo
/

marka-data-api

Sleeping

App Files Files Community

rairo committed on Mar 2

Commit

3065188

verified ·

1 Parent(s): 5f73dfd

Create main.py

Browse files

Files changed (1) hide show

main.py +367 -0

main.py ADDED Viewed

	@@ -0,0 +1,367 @@

+import os
+import json
+import logging
+import re
+import time
+import numpy as np
+import fitz  # PyMuPDF
+from flask import Flask, request, jsonify
+from flask_cors import CORS
+from google import genai
+from google.genai import types
+from sklearn.metrics.pairwise import cosine_similarity
+# --- CONFIGURATION ---
+# Root logging: INFO and above, formatted as "<timestamp> - <level> - <message>".
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+# Directory where your PDFs live (e.g., ./syllabi/A/Physics.pdf).
+# The parent folder name of each PDF is used as the syllabus "level".
+SYLLABI_DIR = "syllabi"
+# NOTE(review): intended as a local cache file, but nothing in the visible code reads or writes it.
+INDEX_FILE = "syllabus_index.json" # Local cache file
+# Google GenAI Config
+# None when the environment variable is unset; generate_embeddings() then returns zero vectors.
+GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
+EMBEDDING_MODEL = "models/text-embedding-004"
+# --- GLOBAL STATE (IN-MEMORY) ---
+# Filled by build_index(). Keyed by "<level>_<code>"; each value is the dict returned by PDFParser.parse():
+# { "A_9706": { "meta": { "id", "subject", "code", "level" }, "tree": [...] }, ... }
+SYLLABUS_MAP = {}
+# One entry per embedded subtopic chunk, in the same order as the rows of VECTOR_MATRIX:
+# [ { "vector": <numpy array>, "meta": { "subject_id", "topic_id", "subtopic_id", "title", "content" } } ]
+VECTOR_DB = []
+# Stacked chunk vectors (one row per VECTOR_DB entry); stays None until build_index() has
+# at least one vector, and /v1/search answers 503 while it is None.
+VECTOR_MATRIX = None # Numpy array for fast math
+app = Flask(__name__)
+# CORS is enabled with the flask-cors defaults (no options passed).
+# NOTE(review): confirm the default origin policy is what this deployment wants.
+CORS(app)
+# -----------------------------------------------------------------------------
+# 1. THE PARSER ENGINE (Extracts Structure from PDF)
+# -----------------------------------------------------------------------------
+class PDFParser:
+    def __init__(self, filepath):
+        self.filepath = filepath
+        self.filename = os.path.basename(filepath)
+        self.doc = fitz.open(filepath)
+        # Determine Subject and Level from filename/path
+        # Expected: syllabi/A/Accounting_9706.pdf
+        parts = filepath.split(os.sep)
+        self.level = parts[-2] if len(parts) > 1 else "General"
+        # Extract code if present (e.g., 9618)
+        self.subject_code = re.search(r'\d{4}', self.filename)
+        self.subject_code = self.subject_code.group(0) if self.subject_code else "0000"
+        self.subject_name = self.filename.split('_')[0]
+        self.unique_id = f"{self.level}_{self.subject_code}"
+    def get_font_characteristics(self):
+        """Scans PDF to find the most common font size (body text)."""
+        font_sizes = {}
+        for page in self.doc:
+            blocks = page.get_text("dict")["blocks"]
+            for b in blocks:
+                for l in b.get("lines", []):
+                    for s in l.get("spans", []):
+                        size = round(s["size"], 1)
+                        font_sizes[size] = font_sizes.get(size, 0) + len(s["text"])
+        # The font size with the most characters is likely the "Body Text"
+        if not font_sizes: return 10.0
+        return max(font_sizes, key=font_sizes.get)
+    def parse(self):
+        """
+        Heuristic parsing:
+        - Text significantly larger than body = Topic
+        - Bold text slightly larger than body = Subtopic
+        - Body text = Content/Objectives
+        """
+        body_size = self.get_font_characteristics()
+        logger.info(f"Parsing {self.filename} (Body size approx {body_size}pt)")
+        syllabus_tree = []
+        current_topic = None
+        current_subtopic = None
+        # Regex to detect "Topic 1" or "1.1" or "Key Question"
+        topic_pattern = re.compile(r'^(\d+\.?\s|Key Question\s)', re.IGNORECASE)
+        for page in self.doc:
+            blocks = page.get_text("dict")["blocks"]
+            for b in blocks:
+                block_text = ""
+                max_size = 0
+                is_bold = False
+                # Reconstruct line text and finding max font style
+                for l in b.get("lines", []):
+                    for s in l.get("spans", []):
+                        text = s["text"].strip()
+                        if not text: continue
+                        block_text += text + " "
+                        if s["size"] > max_size: max_size = s["size"]
+                        if "bold" in s["font"].lower(): is_bold = True
+                block_text = block_text.strip()
+                if len(block_text) < 3: continue # Skip noise
+                # HEURISTIC 1: TOPIC (Large Header)
+                # Usually 2pt+ larger than body
+                if max_size > body_size + 2:
+                    # Save previous
+                    if current_subtopic and current_topic:
+                        current_topic["children"].append(current_subtopic)
+                        current_subtopic = None
+                    if current_topic:
+                        syllabus_tree.append(current_topic)
+                    current_topic = {
+                        "id": f"{self.unique_id}_{len(syllabus_tree)}",
+                        "title": block_text,
+                        "type": "topic",
+                        "children": []
+                    }
+                    current_subtopic = None
+                # HEURISTIC 2: SUBTOPIC (Bold, slightly larger or same size as body)
+                # Must start with number or specific keyword to reduce noise
+                elif (is_bold and max_size >= body_size) or (topic_pattern.match(block_text) and max_size >= body_size):
+                    if current_subtopic and current_topic:
+                        current_topic["children"].append(current_subtopic)
+                    # If no topic exists yet, create a dummy one
+                    if not current_topic:
+                        current_topic = {"id": f"{self.unique_id}_root", "title": "Syllabus Overview", "type": "topic", "children": []}
+                    current_subtopic = {
+                        "id": f"{current_topic['id']}_{len(current_topic['children'])}",
+                        "title": block_text,
+                        "type": "subtopic",
+                        "content": []
+                    }
+                # HEURISTIC 3: CONTENT (Body Text)
+                elif max_size <= body_size + 1:
+                    if current_subtopic:
+                        current_subtopic["content"].append(block_text)
+                    elif current_topic:
+                        # Sometimes text appears directly under a topic
+                        # Create implicit subtopic
+                        current_subtopic = {
+                            "id": f"{current_topic['id']}_intro",
+                            "title": "Introduction / Overview",
+                            "type": "subtopic",
+                            "content": [block_text]
+                        }
+        # Flush remainders
+        if current_subtopic and current_topic:
+            current_topic["children"].append(current_subtopic)
+        if current_topic:
+            syllabus_tree.append(current_topic)
+        return {
+            "meta": {
+                "id": self.unique_id,
+                "subject": self.subject_name,
+                "code": self.subject_code,
+                "level": self.level
+            },
+            "tree": syllabus_tree
+        }
+# -----------------------------------------------------------------------------
+# 2. THE VECTOR ENGINE (Embeddings & Search)
+# -----------------------------------------------------------------------------
+def generate_embeddings(texts):
+    """Generates embeddings using Gemini API (Batching recommended for production)."""
+    if not GEMINI_API_KEY:
+        logger.warning("No Gemini API Key found. Skipping embeddings.")
+        return [np.zeros(768) for _ in texts] # Dummy vectors
+    client = genai.Client(api_key=GEMINI_API_KEY)
+    results = []
+    # Simple batching to avoid hitting limits
+    batch_size = 10
+    for i in range(0, len(texts), batch_size):
+        batch = texts[i:i+batch_size]
+        try:
+            resp = client.models.embed_content(
+                model=EMBEDDING_MODEL,
+                contents=batch,
+            )
+            # Handle list of embeddings
+            for embedding in resp.embeddings:
+                results.append(np.array(embedding.values))
+        except Exception as e:
+            logger.error(f"Embedding failed: {e}")
+            # Fallback for failed batch
+            for _ in batch: results.append(np.zeros(768))
+    return results
+def build_index():
+    """Walks the directory, parses PDFs, builds JSON tree and Vector Index."""
+    global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX
+    logger.info("🚀 Starting Build Process...")
+    # 1. Walk Directory
+    if not os.path.exists(SYLLABI_DIR):
+        logger.error(f"Directory {SYLLABI_DIR} not found.")
+        return
+    parsed_data = []
+    for root, dirs, files in os.walk(SYLLABI_DIR):
+        for file in files:
+            if file.endswith(".pdf"):
+                path = os.path.join(root, file)
+                parser = PDFParser(path)
+                data = parser.parse()
+                parsed_data.append(data)
+                # Store in Map
+                SYLLABUS_MAP[data["meta"]["id"]] = data
+    # 2. Flatten for Vectorization
+    chunks_to_embed = []
+    chunk_metadata = []
+    for item in parsed_data:
+        meta_base = item["meta"]
+        for topic in item["tree"]:
+            for sub in topic["children"]:
+                # Create a rich semantic chunk
+                # Format: "Subject Level - Topic - Subtopic: Content"
+                text_blob = "\n".join(sub["content"])
+                if len(text_blob) < 10: continue # Skip empty chunks
+                rich_text = f"{meta_base['subject']} {meta_base['level']} - {topic['title']} - {sub['title']}:\n{text_blob}"
+                chunks_to_embed.append(rich_text)
+                chunk_metadata.append({
+                    "subject_id": meta_base["id"],
+                    "topic_id": topic["id"],
+                    "subtopic_id": sub["id"],
+                    "title": sub["title"],
+                    "content": text_blob
+                })
+    # 3. Generate Embeddings
+    logger.info(f"🧮 Generating embeddings for {len(chunks_to_embed)} chunks...")
+    vectors = generate_embeddings(chunks_to_embed)
+    # 4. Populate Global DB
+    VECTOR_DB = []
+    valid_vectors = []
+    for i, vec in enumerate(vectors):
+        VECTOR_DB.append({
+            "vector": vec, # Keep for debug/individual access
+            "meta": chunk_metadata[i]
+        })
+        valid_vectors.append(vec)
+    if valid_vectors:
+        VECTOR_MATRIX = np.vstack(valid_vectors)
+    logger.info("✅ Indexing Complete.")
+# -----------------------------------------------------------------------------
+# 3. API SERVER (The Retrieval Layer)
+# -----------------------------------------------------------------------------
+@app.route('/health', methods=['GET'])
+def health():
+    return jsonify({"status": "online", "subjects_loaded": list(SYLLABUS_MAP.keys())})
+@app.route('/v1/structure/<subject_id>', methods=['GET'])
+def get_structure(subject_id):
+    """Returns the static JSON tree for navigation UI."""
+    data = SYLLABUS_MAP.get(subject_id)
+    if not data:
+        return jsonify({"error": "Subject not found"}), 404
+    return jsonify(data)
+@app.route('/v1/search', methods=['POST'])
+def search():
+    """
+    Semantic Retrieval.
+    Input: { "query": "...", "filter_subject_id": "..." (optional) }
+    """
+    if VECTOR_MATRIX is None:
+        return jsonify({"error": "Index not ready"}), 503
+    data = request.json
+    query = data.get("query")
+    subject_filter = data.get("filter_subject_id")
+    if not query:
+        return jsonify({"error": "Query required"}), 400
+    # 1. Embed Query
+    client = genai.Client(api_key=GEMINI_API_KEY)
+    try:
+        resp = client.models.embed_content(model=EMBEDDING_MODEL, contents=query)
+        query_vec = np.array(resp.embeddings[0].values).reshape(1, -1)
+    except Exception as e:
+        return jsonify({"error": str(e)}), 500
+    # 2. Vector Search (Cosine Similarity)
+    # scores shape: (1, N_chunks)
+    scores = cosine_similarity(query_vec, VECTOR_MATRIX)[0]
+    # 3. Filter and Sort
+    results = []
+    # Get top 10 indices
+    top_indices = np.argsort(scores)[::-1]
+    count = 0
+    for idx in top_indices:
+        if scores[idx] < 0.3: break # Threshold cutoff
+        entry = VECTOR_DB[idx]
+        meta = entry["meta"]
+        # Apply Filter
+        if subject_filter and meta["subject_id"] != subject_filter:
+            continue
+        results.append({
+            "score": float(scores[idx]),
+            "subject_id": meta["subject_id"],
+            "title": meta["title"],
+            "content": meta["content"], # Raw text chunk
+            "node_id": meta["subtopic_id"] # Pointer to the structure tree
+        })
+        count += 1
+        if count >= 5: break # Limit to top 5
+    return jsonify({"results": results})
+# -----------------------------------------------------------------------------
+# 4. STARTUP BOOTSTRAP
+# -----------------------------------------------------------------------------
+def start_app():
+    # In a real deployment, we might load from disk here.
+    # For now, we rebuild on boot.
+    if not os.path.exists(SYLLABI_DIR):
+        os.makedirs(os.path.join(SYLLABI_DIR, "A"), exist_ok=True)
+        os.makedirs(os.path.join(SYLLABI_DIR, "O"), exist_ok=True)
+        logger.warning(f"Created empty {SYLLABI_DIR}. Please add PDFs.")
+    # Run Indexer
+    build_index()
+# Run the builder once on import (or server start).
+# NOTE: this is an import-time side effect - any process that imports this module
+# parses the PDFs and requests embeddings (via start_app -> build_index) before it
+# can serve a request.
+with app.app_context():
+    start_app()
+# Only reached when the file is executed directly, not when it is imported.
+if __name__ == '__main__':
+    # Use 7860 for HF Spaces; 0.0.0.0 binds every network interface.
+    app.run(host='0.0.0.0', port=7860)