Spaces:

rairo
/

marka-data-api

Sleeping

App Files Files Community

marka-data-api / main.py

rairo

Create main.py

3065188 verified about 1 month ago

raw

history blame contribute delete

13.5 kB

	import os
	import json
	import logging
	import re
	import time
	import numpy as np
	import fitz # PyMuPDF
	from flask import Flask, request, jsonify
	from flask_cors import CORS
	from google import genai
	from google.genai import types
	from sklearn.metrics.pairwise import cosine_similarity

	# --- CONFIGURATION ---
	# Configure the root logger at import time; `logger` is this module's logger.
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
	logger = logging.getLogger(__name__)

	# Directory where your PDFs live (e.g., ./syllabi/A/Physics.pdf)
	SYLLABI_DIR = "syllabi"
	# NOTE(review): INDEX_FILE is not referenced by any code visible in this file.
	INDEX_FILE = "syllabus_index.json" # Local cache file

	# Google GenAI Config
	# GEMINI_API_KEY is None when the environment variable is unset.
	GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
	EMBEDDING_MODEL = "models/text-embedding-004"

	# --- GLOBAL STATE (IN-MEMORY) ---
	# Structure: { "A_9706": { "meta": { "id", "subject", "code", "level" }, "tree": [...] }, ... }
	# Keyed by the parser's unique id ("<level>_<code>"); filled by build_index().
	SYLLABUS_MAP = {}

	# Structure: [ { "vector": <np.ndarray>, "meta": { "subject_id", "topic_id", "subtopic_id", "title", "content" } } ]
	VECTOR_DB = []
	# Row i of VECTOR_MATRIX is the vector stored in VECTOR_DB[i]; None until an index has been built.
	VECTOR_MATRIX = None # Numpy array for fast math

	# Flask application object; CORS(app) applies flask-cors with its default settings.
	app = Flask(__name__)
	CORS(app)

	# -----------------------------------------------------------------------------
	# 1. THE PARSER ENGINE (Extracts Structure from PDF)
	# -----------------------------------------------------------------------------

	class PDFParser:
	    """Heuristically extract a topic / subtopic / content tree from a syllabus PDF.

	    Subject metadata is derived from the file path, which is expected to look
	    like ``syllabi/<level>/<Subject>_<code>.pdf``
	    (e.g. ``syllabi/A/Accounting_9706.pdf``). Document structure is inferred
	    only from font size and weight relative to the dominant ("body") font size.

	    The instance keeps the PyMuPDF document open in ``self.doc``. Call
	    :meth:`close` (or use the instance as a context manager) when done.
	    """

	    # Numbered headings ("1 ", "1. ", "1.1 ", "2.3.4 "), "Topic 1" and
	    # "Key Question ..." mark a subtopic even when the text is not bold.
	    # The previous pattern escaped the alternation bar (``\|``) and therefore
	    # only matched a literal pipe character.
	    _HEADING_PATTERN = re.compile(
	        r'^(?:\d+(?:\.\d+)*\.?\s|Topic\s+\d+|Key Question\s)', re.IGNORECASE
	    )

	    def __init__(self, filepath):
	        """Open *filepath* and derive level, subject code and subject name.

	        Raises whatever ``fitz.open`` raises when the file cannot be opened.
	        """
	        self.filepath = filepath
	        self.filename = os.path.basename(filepath)
	        self.doc = fitz.open(filepath)

	        # Determine Subject and Level from filename/path
	        # Expected: syllabi/A/Accounting_9706.pdf
	        # os.path handles every separator the platform accepts, unlike a
	        # plain split on os.sep.
	        parent_dir = os.path.basename(os.path.dirname(filepath))
	        self.level = parent_dir or "General"

	        # Extract code if present (e.g., 9618)
	        code_match = re.search(r'\d{4}', self.filename)
	        self.subject_code = code_match.group(0) if code_match else "0000"

	        # Drop the extension first so "Physics.pdf" yields "Physics".
	        stem = os.path.splitext(self.filename)[0]
	        self.subject_name = stem.split('_')[0]
	        self.unique_id = f"{self.level}_{self.subject_code}"

	    def close(self):
	        """Release the underlying PyMuPDF document (safe to call repeatedly)."""
	        if not getattr(self.doc, "is_closed", False):
	            self.doc.close()

	    def __enter__(self):
	        return self

	    def __exit__(self, exc_type, exc_value, traceback):
	        self.close()

	    def get_font_characteristics(self):
	        """Return the font size (rounded to 0.1pt) that carries the most characters.

	        That size is taken to be the body text size. Returns ``10.0`` when the
	        document contains no text spans.
	        """
	        font_sizes = {}
	        for page in self.doc:
	            for block in page.get_text("dict")["blocks"]:
	                for line in block.get("lines", []):
	                    for span in line.get("spans", []):
	                        size = round(span["size"], 1)
	                        font_sizes[size] = font_sizes.get(size, 0) + len(span["text"])

	        # The font size with the most characters is likely the "Body Text"
	        if not font_sizes:
	            return 10.0
	        return max(font_sizes, key=font_sizes.get)

	    @staticmethod
	    def _summarize_block(block):
	        """Return ``(text, max_size, is_bold)`` for one text-block dict.

	        Blank spans are ignored entirely. ``is_bold`` is true when any
	        non-blank span uses a font whose name contains "bold".
	        """
	        pieces = []
	        max_size = 0
	        is_bold = False
	        for line in block.get("lines", []):
	            for span in line.get("spans", []):
	                text = span["text"].strip()
	                if not text:
	                    continue
	                pieces.append(text)
	                if span["size"] > max_size:
	                    max_size = span["size"]
	                if "bold" in span["font"].lower():
	                    is_bold = True
	        return " ".join(pieces), max_size, is_bold

	    @classmethod
	    def _classify_block(cls, text, max_size, is_bold, body_size):
	        """Classify a block as ``"topic"``, ``"subtopic"``, ``"content"`` or ``None``.

	        ``max_size`` is rounded exactly like the body size so that e.g. a
	        9.96pt bold heading still compares equal to a 10.0pt body size.
	        """
	        size = round(max_size, 1)
	        # HEURISTIC 1: TOPIC (Large Header), 2pt+ larger than body
	        if size > body_size + 2:
	            return "topic"
	        # HEURISTIC 2: SUBTOPIC (bold or heading-like, at least body size)
	        if size >= body_size and (is_bold or cls._HEADING_PATTERN.match(text)):
	            return "subtopic"
	        # HEURISTIC 3: CONTENT (Body Text)
	        if size <= body_size + 1:
	            return "content"
	        # Between +1pt and +2pt, not bold, no heading prefix: ignored.
	        return None

	    def parse(self):
	        """
	        Heuristic parsing:
	        - Text significantly larger than body = Topic
	        - Bold / numbered text at least as large as body = Subtopic
	        - Body text = Content/Objectives

	        Returns ``{"meta": {...}, "tree": [topic, ...]}`` where each topic has a
	        ``children`` list of subtopics and each subtopic a ``content`` list of
	        text blocks. Body text that appears before any topic or subtopic is
	        dropped.
	        """
	        body_size = self.get_font_characteristics()
	        logger.info(f"Parsing {self.filename} (Body size approx {body_size}pt)")

	        syllabus_tree = []
	        current_topic = None
	        current_subtopic = None

	        for page in self.doc:
	            for block in page.get_text("dict")["blocks"]:
	                block_text, max_size, is_bold = self._summarize_block(block)
	                if len(block_text) < 3:
	                    continue  # Skip noise

	                kind = self._classify_block(block_text, max_size, is_bold, body_size)

	                if kind == "topic":
	                    # Save previous
	                    if current_subtopic and current_topic:
	                        current_topic["children"].append(current_subtopic)
	                    if current_topic:
	                        syllabus_tree.append(current_topic)

	                    current_topic = {
	                        "id": f"{self.unique_id}_{len(syllabus_tree)}",
	                        "title": block_text,
	                        "type": "topic",
	                        "children": []
	                    }
	                    current_subtopic = None

	                elif kind == "subtopic":
	                    if current_subtopic and current_topic:
	                        current_topic["children"].append(current_subtopic)

	                    # If no topic exists yet, create a dummy one
	                    if not current_topic:
	                        current_topic = {
	                            "id": f"{self.unique_id}_root",
	                            "title": "Syllabus Overview",
	                            "type": "topic",
	                            "children": []
	                        }

	                    current_subtopic = {
	                        "id": f"{current_topic['id']}_{len(current_topic['children'])}",
	                        "title": block_text,
	                        "type": "subtopic",
	                        "content": []
	                    }

	                elif kind == "content":
	                    if current_subtopic:
	                        current_subtopic["content"].append(block_text)
	                    elif current_topic:
	                        # Sometimes text appears directly under a topic
	                        # Create implicit subtopic
	                        current_subtopic = {
	                            "id": f"{current_topic['id']}_intro",
	                            "title": "Introduction / Overview",
	                            "type": "subtopic",
	                            "content": [block_text]
	                        }

	        # Flush remainders
	        if current_subtopic and current_topic:
	            current_topic["children"].append(current_subtopic)
	        if current_topic:
	            syllabus_tree.append(current_topic)

	        return {
	            "meta": {
	                "id": self.unique_id,
	                "subject": self.subject_name,
	                "code": self.subject_code,
	                "level": self.level
	            },
	            "tree": syllabus_tree
	        }

	# -----------------------------------------------------------------------------
	# 2. THE VECTOR ENGINE (Embeddings & Search)
	# -----------------------------------------------------------------------------

	def generate_embeddings(texts, batch_size=10):
	    """Embed *texts* with the Gemini API and return one vector per text.

	    Args:
	        texts: Sequence of strings to embed.
	        batch_size: Number of texts sent per API request (must be >= 1).

	    Returns:
	        A list of numpy arrays, exactly ``len(texts)`` long and in the same
	        order as *texts*. When no API key is configured, or a batch fails,
	        the affected texts get a 768-dimensional zero vector instead.

	    Raises:
	        ValueError: If *batch_size* is smaller than 1.
	    """
	    if batch_size < 1:
	        raise ValueError("batch_size must be >= 1")

	    # Size of the placeholder vectors used when an embedding is unavailable.
	    placeholder_dim = 768

	    if not GEMINI_API_KEY:
	        logger.warning("No Gemini API Key found. Skipping embeddings.")
	        return [np.zeros(placeholder_dim) for _ in texts]  # Dummy vectors

	    client = genai.Client(api_key=GEMINI_API_KEY)
	    results = []

	    # Simple batching to avoid hitting limits
	    for start in range(0, len(texts), batch_size):
	        batch = texts[start:start + batch_size]
	        try:
	            resp = client.models.embed_content(
	                model=EMBEDDING_MODEL,
	                contents=batch,
	            )
	            # Build the whole batch before touching `results`: a failure
	            # half-way through must not leave partial vectors behind, or the
	            # output would no longer line up 1:1 with `texts`.
	            batch_vectors = [np.array(embedding.values) for embedding in resp.embeddings]
	            if len(batch_vectors) != len(batch):
	                raise ValueError(
	                    f"expected {len(batch)} embeddings, got {len(batch_vectors)}"
	                )
	        except Exception as e:
	            # Deliberate best-effort: log and fall back for this batch only.
	            logger.error(f"Embedding failed: {e}")
	            batch_vectors = [np.zeros(placeholder_dim) for _ in batch]

	        results.extend(batch_vectors)

	    return results

	def build_index():
	    """Walk SYLLABI_DIR, parse every PDF and rebuild the in-memory indexes.

	    Side effects (module globals):
	        SYLLABUS_MAP: replaced in place with the freshly parsed subjects.
	        VECTOR_DB: list of ``{"vector", "meta"}`` entries, one per chunk.
	        VECTOR_MATRIX: stacked vectors (row i belongs to ``VECTOR_DB[i]``),
	            or ``None`` when there is nothing to search.

	    A PDF that cannot be opened or parsed is logged and skipped so that one
	    bad file does not abort the whole build. If SYLLABI_DIR does not exist
	    the function logs an error and leaves all state untouched.
	    """
	    global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX

	    logger.info("🚀 Starting Build Process...")

	    # 1. Walk Directory
	    if not os.path.exists(SYLLABI_DIR):
	        logger.error(f"Directory {SYLLABI_DIR} not found.")
	        return

	    parsed_data = []
	    fresh_map = {}

	    for root, dirs, files in os.walk(SYLLABI_DIR):
	        dirs.sort()  # Deterministic traversal order
	        for file in sorted(files):
	            # Case-insensitive so "Physics.PDF" is picked up as well.
	            if not file.lower().endswith(".pdf"):
	                continue

	            path = os.path.join(root, file)
	            try:
	                parser = PDFParser(path)
	                try:
	                    data = parser.parse()
	                finally:
	                    # Release the PDF handle as soon as parsing is done.
	                    parser.doc.close()
	            except Exception:
	                logger.exception(f"Failed to parse {path}. Skipping.")
	                continue

	            subject_id = data["meta"]["id"]
	            if subject_id in fresh_map:
	                logger.warning(f"Duplicate subject id {subject_id} ({path}); later file wins in the structure map.")

	            parsed_data.append(data)
	            fresh_map[subject_id] = data

	    # Store in Map (in place, so stale subjects from an earlier build vanish)
	    SYLLABUS_MAP.clear()
	    SYLLABUS_MAP.update(fresh_map)

	    # 2. Flatten for Vectorization
	    chunks_to_embed = []
	    chunk_metadata = []

	    for item in parsed_data:
	        meta_base = item["meta"]
	        for topic in item["tree"]:
	            for sub in topic["children"]:
	                # Create a rich semantic chunk
	                # Format: "Subject Level - Topic - Subtopic: Content"
	                text_blob = "\n".join(sub["content"])
	                if len(text_blob) < 10:
	                    continue  # Skip empty chunks

	                rich_text = f"{meta_base['subject']} {meta_base['level']} - {topic['title']} - {sub['title']}:\n{text_blob}"

	                chunks_to_embed.append(rich_text)
	                chunk_metadata.append({
	                    "subject_id": meta_base["id"],
	                    "topic_id": topic["id"],
	                    "subtopic_id": sub["id"],
	                    "title": sub["title"],
	                    "content": text_blob
	                })

	    # 3. Generate Embeddings
	    logger.info(f"🧮 Generating embeddings for {len(chunks_to_embed)} chunks...")
	    vectors = generate_embeddings(chunks_to_embed)

	    # Vectors and metadata are paired by position; refuse to publish an
	    # index in which they could be misaligned.
	    if len(vectors) != len(chunk_metadata):
	        logger.error(f"Embedding count mismatch: {len(vectors)} vectors for {len(chunk_metadata)} chunks.")
	        VECTOR_DB = []
	        VECTOR_MATRIX = None
	        return

	    # 4. Populate Global DB
	    VECTOR_DB = [
	        {
	            "vector": vec,  # Keep for debug/individual access
	            "meta": meta
	        }
	        for vec, meta in zip(vectors, chunk_metadata)
	    ]

	    # Reset to None when empty so a stale matrix never outlives its VECTOR_DB.
	    VECTOR_MATRIX = np.vstack(vectors) if vectors else None

	    logger.info("✅ Indexing Complete.")

	# -----------------------------------------------------------------------------
	# 3. API SERVER (The Retrieval Layer)
	# -----------------------------------------------------------------------------

	@app.route('/health', methods=['GET'])
	def health():
	    """Report that the service is up, plus the ids of all loaded subjects."""
	    loaded_subject_ids = list(SYLLABUS_MAP)
	    payload = {"status": "online", "subjects_loaded": loaded_subject_ids}
	    return jsonify(payload)

	@app.route('/v1/structure/<subject_id>', methods=['GET'])
	def get_structure(subject_id):
	    """Serve the parsed syllabus tree of one subject for the navigation UI.

	    Responds with 404 and an error payload when the subject id is unknown.
	    """
	    subject = SYLLABUS_MAP.get(subject_id)
	    if subject:
	        return jsonify(subject)
	    return jsonify({"error": "Subject not found"}), 404

	@app.route('/v1/search', methods=['POST'])
	def search():
	    """
	    Semantic Retrieval.
	    Input: { "query": "...", "filter_subject_id": "..." (optional) }

	    Output: { "results": [ { "score", "subject_id", "title", "content",
	    "node_id" }, ... ] } with at most 5 entries, best match first, all with a
	    cosine similarity of at least 0.3.

	    Errors (JSON body with an "error" key):
	        503 - the vector index has not been built.
	        400 - body is not a JSON object, or "query" is missing / not a string.
	        500 - the query could not be embedded or does not fit the index.
	    """
	    min_score = 0.3   # Threshold cutoff
	    max_results = 5   # Limit to top 5

	    # Read each global once so one request works on a consistent pair.
	    matrix = VECTOR_MATRIX
	    vector_db = VECTOR_DB
	    if matrix is None:
	        return jsonify({"error": "Index not ready"}), 503

	    # silent=True yields None instead of raising on a missing or malformed
	    # JSON body, so the client always gets a JSON error response.
	    data = request.get_json(silent=True)
	    if not isinstance(data, dict):
	        return jsonify({"error": "JSON object body required"}), 400

	    query = data.get("query")
	    subject_filter = data.get("filter_subject_id")

	    if not query or not isinstance(query, str):
	        return jsonify({"error": "Query required"}), 400

	    # 1. Embed Query
	    # Client construction sits inside the try so that a failure there is
	    # reported as JSON like any other embedding error.
	    try:
	        client = genai.Client(api_key=GEMINI_API_KEY)
	        resp = client.models.embed_content(model=EMBEDDING_MODEL, contents=query)
	        query_vec = np.array(resp.embeddings[0].values).reshape(1, -1)
	    except Exception as e:
	        logger.error(f"Query embedding failed: {e}")
	        # NOTE(review): this returns the raw exception text to the client.
	        return jsonify({"error": str(e)}), 500

	    if query_vec.shape[1] != matrix.shape[1]:
	        logger.error(f"Query embedding has {query_vec.shape[1]} dimensions, index has {matrix.shape[1]}.")
	        return jsonify({"error": "Embedding dimension mismatch"}), 500

	    # 2. Vector Search (Cosine Similarity)
	    # scores shape: (N_chunks,)
	    scores = cosine_similarity(query_vec, matrix)[0]

	    # 3. Filter and Sort
	    # All chunk indices, best score first; the loop stops early.
	    ranked_indices = np.argsort(scores)[::-1]

	    results = []
	    for idx in ranked_indices:
	        if scores[idx] < min_score:
	            break  # Everything after this scores lower still

	        meta = vector_db[idx]["meta"]

	        # Apply Filter
	        if subject_filter and meta["subject_id"] != subject_filter:
	            continue

	        results.append({
	            "score": float(scores[idx]),
	            "subject_id": meta["subject_id"],
	            "title": meta["title"],
	            "content": meta["content"],  # Raw text chunk
	            "node_id": meta["subtopic_id"]  # Pointer to the structure tree
	        })

	        if len(results) >= max_results:
	            break

	    return jsonify({"results": results})

	# -----------------------------------------------------------------------------
	# 4. STARTUP BOOTSTRAP
	# -----------------------------------------------------------------------------

	def start_app():
	    """Ensure the syllabus folder skeleton exists, then build the index.

	    No cached index is loaded from disk; the index is rebuilt on every boot.
	    """
	    syllabi_missing = not os.path.exists(SYLLABI_DIR)
	    if syllabi_missing:
	        # Create one empty sub-folder per level.
	        for level_name in ("A", "O"):
	            os.makedirs(os.path.join(SYLLABI_DIR, level_name), exist_ok=True)
	        logger.warning(f"Created empty {SYLLABI_DIR}. Please add PDFs.")

	    # Run Indexer
	    build_index()

	# Run the builder once on import (or server start)
	# NOTE: this executes at import time, so importing this module creates the
	# syllabus folders if needed and runs build_index() before any request is served.
	with app.app_context():
	start_app()

	# Only start Flask's built-in server when the file is executed directly.
	if __name__ == '__main__':
	# Use 7860 for HF Spaces
	app.run(host='0.0.0.0', port=7860)