# Source: Hugging Face Space "rairo/marka-data-api" (status: Sleeping).
# File size: 13,492 bytes.
import os
import json
import logging
import re
import time
import numpy as np
import fitz  # PyMuPDF
from flask import Flask, request, jsonify
from flask_cors import CORS
from google import genai
from google.genai import types
from sklearn.metrics.pairwise import cosine_similarity

# --- CONFIGURATION ---
# Root logging is configured here, at import time, before anything else runs.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Directory where your PDFs live (e.g., ./syllabi/A/Physics.pdf).
# The sub-directory name ("A", "O", ...) is used as the syllabus level.
SYLLABI_DIR = "syllabi"
# Intended local cache file name.
# NOTE(review): nothing in this file reads or writes it yet.
INDEX_FILE = "syllabus_index.json" # Local cache file

# Google GenAI Config
# The key is read once at import; when it is unset, generate_embeddings()
# falls back to all-zero vectors instead of calling the API.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
EMBEDDING_MODEL = "models/text-embedding-004"

# --- GLOBAL STATE (IN-MEMORY) ---
# Filled by build_index(). Keyed by "<level>_<code>", each value is the dict
# returned by PDFParser.parse():
#   { "A_9706": { "meta": {"id", "subject", "code", "level"}, "tree": [...] }, ... }
SYLLABUS_MAP = {} 

# One entry per embedded subtopic chunk, as stored by build_index():
#   [ { "vector": <np.ndarray>,
#       "meta": {"subject_id", "topic_id", "subtopic_id", "title", "content"} } ]
VECTOR_DB = []
# Row i is the vector of VECTOR_DB[i]; None until an index has been built.
VECTOR_MATRIX = None # Numpy array for fast math

app = Flask(__name__)
# Enable CORS on the app with flask_cors defaults.
CORS(app)

# -----------------------------------------------------------------------------
# 1. THE PARSER ENGINE (Extracts Structure from PDF)
# -----------------------------------------------------------------------------

class PDFParser:
    """Heuristic parser that turns one syllabus PDF into a topic tree.

    The level, subject code and subject name are derived from the file path
    (expected layout: ``syllabi/A/Accounting_9706.pdf``). The tree is inferred
    from the font sizes and font names PyMuPDF reports for each text block.

    The open document is exposed as ``self.doc``. Call :meth:`close`, or use
    the parser as a context manager, to release it when finished.
    """

    # Body size assumed when the document contains no text spans at all.
    DEFAULT_BODY_SIZE = 10.0

    # Headings such as "1 ", "1. ", "1.1 ", "2.3.4 ", "Topic 1", "Key Question ".
    _HEADING_PATTERN = re.compile(
        r'^(\d+(?:\.\d+)*\.?\s|Topic\s+\d+|Key Question\s)', re.IGNORECASE
    )

    def __init__(self, filepath):
        """Open *filepath* with PyMuPDF and derive its identifying metadata."""
        self.filepath = filepath
        self.filename = os.path.basename(filepath)
        self.level, self.subject_code, self.subject_name = self._identify(filepath)
        # NOTE: files without a 4-digit code all share "<level>_0000".
        self.unique_id = f"{self.level}_{self.subject_code}"
        self.doc = fitz.open(filepath)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False

    def close(self):
        """Close the underlying document if it is still open."""
        doc = getattr(self, "doc", None)
        if doc is not None and not getattr(doc, "is_closed", False):
            doc.close()

    @staticmethod
    def _identify(filepath):
        """Return ``(level, subject_code, subject_name)`` for *filepath*.

        - level: name of the parent directory, or "General" when the path
          has no directory component.
        - subject_code: first run of four digits in the filename, or "0000".
        - subject_name: filename stem up to the first underscore. The
          extension is stripped first, so "Physics.pdf" yields "Physics".
        """
        filename = os.path.basename(filepath)
        parts = filepath.split(os.sep)
        level = parts[-2] if len(parts) > 1 else "General"

        code_match = re.search(r'\d{4}', filename)
        subject_code = code_match.group(0) if code_match else "0000"

        stem = os.path.splitext(filename)[0]
        subject_name = stem.split('_')[0]
        return level, subject_code, subject_name

    @staticmethod
    def _span_size(span):
        """Font size of a span, rounded to 0.1pt.

        Every size comparison uses this value so that the body size and the
        per-block sizes are on the same scale (a raw 9.96pt span would
        otherwise compare as smaller than its own rounded body size, 10.0).
        """
        return round(span["size"], 1)

    def get_font_characteristics(self):
        """Scans PDF to find the most common font size (body text).

        Sizes are weighted by the number of characters set in them. Returns
        ``DEFAULT_BODY_SIZE`` when no spans are found.
        """
        chars_per_size = {}
        for page in self.doc:
            for block in page.get_text("dict")["blocks"]:
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        size = self._span_size(span)
                        chars_per_size[size] = chars_per_size.get(size, 0) + len(span["text"])

        # The font size with the most characters is likely the "Body Text"
        if not chars_per_size:
            return self.DEFAULT_BODY_SIZE
        return max(chars_per_size, key=chars_per_size.get)

    @classmethod
    def _summarize_block(cls, block):
        """Return ``(text, max_size, is_bold)`` for one PyMuPDF block dict.

        Blank spans are ignored; the remaining span texts are joined with
        single spaces. Blocks without "lines" (e.g. images) give ``("", 0, False)``.
        """
        pieces = []
        max_size = 0
        is_bold = False
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                text = span["text"].strip()
                if not text:
                    continue
                pieces.append(text)
                max_size = max(max_size, cls._span_size(span))
                if "bold" in span["font"].lower():
                    is_bold = True
        return " ".join(pieces), max_size, is_bold

    def _build_tree(self, body_size):
        """Walk every text block and assemble the topic -> subtopic tree."""
        tree = []
        topic = None
        subtopic = None

        for page in self.doc:
            for block in page.get_text("dict")["blocks"]:
                text, max_size, is_bold = self._summarize_block(block)
                if len(text) < 3:
                    continue  # Skip noise

                # HEURISTIC 1: TOPIC (large header, more than 2pt above body)
                if max_size > body_size + 2:
                    # Close whatever was open before starting the new topic.
                    if subtopic and topic:
                        topic["children"].append(subtopic)
                    if topic:
                        tree.append(topic)
                    topic = {
                        "id": f"{self.unique_id}_{len(tree)}",
                        "title": text,
                        "type": "topic",
                        "children": [],
                    }
                    subtopic = None

                # HEURISTIC 2: SUBTOPIC (at least body size, and either bold
                # or starting with a heading number / keyword)
                elif max_size >= body_size and (is_bold or self._HEADING_PATTERN.match(text)):
                    if subtopic and topic:
                        topic["children"].append(subtopic)

                    # If no topic exists yet, create a dummy one
                    if not topic:
                        topic = {
                            "id": f"{self.unique_id}_root",
                            "title": "Syllabus Overview",
                            "type": "topic",
                            "children": [],
                        }

                    subtopic = {
                        "id": f"{topic['id']}_{len(topic['children'])}",
                        "title": text,
                        "type": "subtopic",
                        "content": [],
                    }

                # HEURISTIC 3: CONTENT (body text, up to 1pt above body size)
                # NOTE(review): plain text between body+1 and body+2 matches no
                # branch and is dropped, as is content seen before any topic.
                elif max_size <= body_size + 1:
                    if subtopic:
                        subtopic["content"].append(text)
                    elif topic:
                        # Text directly under a topic gets an implicit subtopic.
                        subtopic = {
                            "id": f"{topic['id']}_intro",
                            "title": "Introduction / Overview",
                            "type": "subtopic",
                            "content": [text],
                        }

        # Flush remainders
        if subtopic and topic:
            topic["children"].append(subtopic)
        if topic:
            tree.append(topic)
        return tree

    def parse(self):
        """
        Heuristic parsing:
        - Text significantly larger than body = Topic
        - Bold or numbered text at least as large as body = Subtopic
        - Body text = Content/Objectives

        Returns ``{"meta": {"id", "subject", "code", "level"}, "tree": [...]}``
        where each topic holds ``children`` (subtopics) and each subtopic
        holds ``content`` (a list of text blocks).
        """
        body_size = self.get_font_characteristics()
        logger.info(f"Parsing {self.filename} (Body size approx {body_size}pt)")

        return {
            "meta": {
                "id": self.unique_id,
                "subject": self.subject_name,
                "code": self.subject_code,
                "level": self.level,
            },
            "tree": self._build_tree(body_size),
        }

# -----------------------------------------------------------------------------
# 2. THE VECTOR ENGINE (Embeddings & Search)
# -----------------------------------------------------------------------------

def generate_embeddings(texts, batch_size=10, dim=768):
    """Generates embeddings using Gemini API, one vector per input text.

    Args:
        texts: Sequence of strings to embed.
        batch_size: Number of texts sent per API call (must be >= 1).
        dim: Length of the all-zero placeholder vector used when no API key
            is configured or a batch cannot be embedded.

    Returns:
        A list of numpy arrays with exactly ``len(texts)`` entries, in input
        order. Failed batches are logged and filled with zero vectors rather
        than raising, so callers can always align results with their inputs.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    if not GEMINI_API_KEY:
        logger.warning("No Gemini API Key found. Skipping embeddings.")
        return [np.zeros(dim) for _ in texts] # Dummy vectors

    client = genai.Client(api_key=GEMINI_API_KEY)
    results = []

    # Simple batching to avoid hitting limits
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        try:
            resp = client.models.embed_content(
                model=EMBEDDING_MODEL,
                contents=batch,
            )
            # Build the whole batch before touching `results`, so a failure
            # half-way through cannot leave a partial batch behind.
            batch_vectors = [np.array(embedding.values) for embedding in resp.embeddings]
        except Exception as e:
            logger.error(f"Embedding failed: {e}")
            batch_vectors = None

        if batch_vectors is not None and len(batch_vectors) != len(batch):
            logger.error(
                f"Embedding failed: expected {len(batch)} vectors, got {len(batch_vectors)}"
            )
            batch_vectors = None

        if batch_vectors is None:
            # Fallback for failed batch: keeps output aligned with the input.
            batch_vectors = [np.zeros(dim) for _ in batch]

        results.extend(batch_vectors)

    return results

def build_index():
    """Walks the directory, parses PDFs, builds JSON tree and Vector Index.

    Replaces the module globals ``SYLLABUS_MAP``, ``VECTOR_DB`` and
    ``VECTOR_MATRIX`` with freshly built values. ``VECTOR_MATRIX`` is set to
    ``None`` when there is nothing to embed. If ``SYLLABI_DIR`` does not
    exist, an error is logged and the globals are left untouched.

    A PDF that cannot be opened or parsed is logged and skipped so that one
    bad file does not abort the whole build.
    """
    global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX

    logger.info("🚀 Starting Build Process...")

    # 1. Walk Directory
    if not os.path.exists(SYLLABI_DIR):
        logger.error(f"Directory {SYLLABI_DIR} not found.")
        return

    parsed_data = []
    subject_map = {}

    for root, dirs, files in os.walk(SYLLABI_DIR):
        dirs.sort()  # deterministic traversal order across runs
        for file in sorted(files):
            if not file.lower().endswith(".pdf"):
                continue
            path = os.path.join(root, file)

            parser = None
            try:
                parser = PDFParser(path)
                data = parser.parse()
            except Exception:
                logger.exception(f"Failed to parse {path}; skipping.")
                continue
            finally:
                # Release the PyMuPDF document whether or not parsing worked.
                if parser is not None:
                    parser.doc.close()

            parsed_data.append(data)

            # Store in Map (a later file with the same id replaces the earlier one)
            subject_id = data["meta"]["id"]
            if subject_id in subject_map:
                logger.warning(f"Duplicate subject id {subject_id} from {path}; replacing earlier entry.")
            subject_map[subject_id] = data

    SYLLABUS_MAP = subject_map

    # 2. Flatten for Vectorization
    chunks_to_embed = []
    chunk_metadata = []

    for item in parsed_data:
        meta_base = item["meta"]
        for topic in item["tree"]:
            for sub in topic["children"]:
                # Create a rich semantic chunk
                # Format: "Subject Level - Topic - Subtopic: Content"
                text_blob = "\n".join(sub["content"])
                if len(text_blob) < 10:
                    continue  # Skip empty chunks

                rich_text = f"{meta_base['subject']} {meta_base['level']} - {topic['title']} - {sub['title']}:\n{text_blob}"

                chunks_to_embed.append(rich_text)
                chunk_metadata.append({
                    "subject_id": meta_base["id"],
                    "topic_id": topic["id"],
                    "subtopic_id": sub["id"],
                    "title": sub["title"],
                    "content": text_blob
                })

    # 3. Generate Embeddings
    logger.info(f"🧮 Generating embeddings for {len(chunks_to_embed)} chunks...")
    vectors = generate_embeddings(chunks_to_embed)
    if len(vectors) != len(chunk_metadata):
        logger.error(
            f"Got {len(vectors)} vectors for {len(chunk_metadata)} chunks; extra items are ignored."
        )

    # 4. Populate Global DB
    # zip() keeps vectors and metadata paired even if the counts disagree.
    vector_db = [
        {
            "vector": vec,  # Keep for debug/individual access
            "meta": meta,
        }
        for vec, meta in zip(vectors, chunk_metadata)
    ]

    VECTOR_DB = vector_db
    # Reset to None when empty so a rebuild never leaves a stale matrix behind.
    VECTOR_MATRIX = np.vstack([entry["vector"] for entry in vector_db]) if vector_db else None

    logger.info("✅ Indexing Complete.")

# -----------------------------------------------------------------------------
# 3. API SERVER (The Retrieval Layer)
# -----------------------------------------------------------------------------

@app.route('/health', methods=['GET'])
def health():
    """Liveness probe that also lists the ids of all loaded syllabi."""
    payload = {
        "status": "online",
        "subjects_loaded": list(SYLLABUS_MAP),
    }
    return jsonify(payload)

@app.route('/v1/structure/<subject_id>', methods=['GET'])
def get_structure(subject_id):
    """Serve the stored syllabus tree for *subject_id*, or a 404 JSON error."""
    syllabus = SYLLABUS_MAP.get(subject_id)
    if syllabus:
        return jsonify(syllabus)
    return jsonify({"error": "Subject not found"}), 404

@app.route('/v1/search', methods=['POST'])
def search():
    """
    Semantic Retrieval.
    Input: { "query": "...", "filter_subject_id": "..." (optional) }

    Returns ``{"results": [...]}`` with at most 5 hits, best first. Each hit
    carries ``score``, ``subject_id``, ``title``, ``content`` and ``node_id``
    (the subtopic id in the structure tree). Hits scoring below 0.3 are
    never returned.

    Error responses (all JSON): 503 when no index is built, 400 for a
    missing/invalid body or query, 500 when the query cannot be embedded.
    """
    min_score = 0.3   # Threshold cutoff
    max_results = 5   # Limit to top 5

    matrix = VECTOR_MATRIX
    entries = VECTOR_DB
    if matrix is None:
        return jsonify({"error": "Index not ready"}), 503

    # silent=True yields None instead of raising on a missing or malformed
    # JSON body, so every failure path answers with a JSON error.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "JSON object body required"}), 400

    query = data.get("query")
    subject_filter = data.get("filter_subject_id")

    if not isinstance(query, str) or not query.strip():
        return jsonify({"error": "Query required"}), 400

    # 1. Embed Query
    # Client construction is inside the try so a missing or invalid API key
    # is reported the same way as a failed embedding call.
    try:
        client = genai.Client(api_key=GEMINI_API_KEY)
        resp = client.models.embed_content(model=EMBEDDING_MODEL, contents=query)
        query_vec = np.array(resp.embeddings[0].values).reshape(1, -1)
    except Exception as e:
        logger.exception("Query embedding failed")
        return jsonify({"error": str(e)}), 500

    # 2. Vector Search (Cosine Similarity)
    # scores shape: (N_chunks,)
    scores = cosine_similarity(query_vec, matrix)[0]

    # 3. Filter and Sort
    # Visit every chunk from best to worst; filtering happens before the
    # limit is applied, so a subject filter can still fill all slots.
    results = []
    for idx in np.argsort(scores)[::-1]:
        score = float(scores[idx])
        if score < min_score:
            break  # everything after this is lower still

        meta = entries[idx]["meta"]

        # Apply Filter
        if subject_filter and meta["subject_id"] != subject_filter:
            continue

        results.append({
            "score": score,
            "subject_id": meta["subject_id"],
            "title": meta["title"],
            "content": meta["content"], # Raw text chunk
            "node_id": meta["subtopic_id"] # Pointer to the structure tree
        })

        if len(results) >= max_results:
            break

    return jsonify({"results": results})

# -----------------------------------------------------------------------------
# 4. STARTUP BOOTSTRAP
# -----------------------------------------------------------------------------

def start_app():
    """Ensure the syllabus directory exists, then build the index.

    When ``SYLLABI_DIR`` is missing, empty "A" and "O" level folders are
    created and a warning is logged. The index is rebuilt on every boot;
    nothing is loaded from disk.
    """
    syllabi_missing = not os.path.exists(SYLLABI_DIR)
    if syllabi_missing:
        for level in ("A", "O"):
            os.makedirs(os.path.join(SYLLABI_DIR, level), exist_ok=True)
        logger.warning(f"Created empty {SYLLABI_DIR}. Please add PDFs.")

    # Run Indexer
    build_index()

# Run the builder once on import (or server start).
# This executes whenever the module is imported, not only when it is run as
# a script, so the index is built before any request handler can be called.
with app.app_context():
    start_app()

if __name__ == '__main__':
    # Use 7860 for HF Spaces; binding to 0.0.0.0 listens on all interfaces.
    app.run(host='0.0.0.0', port=7860)