rairo committed on
Commit
3065188
·
verified ·
1 Parent(s): 5f73dfd

Create main.py

Browse files
Files changed (1) hide show
  1. main.py +367 -0
main.py ADDED
@@ -0,0 +1,367 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import logging
4
+ import re
5
+ import time
6
+ import numpy as np
7
+ import fitz # PyMuPDF
8
+ from flask import Flask, request, jsonify
9
+ from flask_cors import CORS
10
+ from google import genai
11
+ from google.genai import types
12
+ from sklearn.metrics.pairwise import cosine_similarity
13
+
14
# --- CONFIGURATION ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Root folder that is walked for syllabus PDFs (e.g., ./syllabi/A/Physics.pdf).
SYLLABI_DIR = "syllabi"
# File name reserved for a local cache of the index.
INDEX_FILE = "syllabus_index.json"

# Google GenAI settings. The key comes from the environment and is None
# when the variable is not set.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
EMBEDDING_MODEL = "models/text-embedding-004"

# --- GLOBAL STATE (IN-MEMORY) ---
# Parsed syllabi keyed by subject id:
#   { "A_9706": { "meta": {...}, "tree": [...] }, ... }
SYLLABUS_MAP = {}

# One entry per embedded chunk: [ { "vector": <array>, "meta": {...} }, ... ]
VECTOR_DB = []
# All chunk vectors stacked row-wise for similarity search; None until built.
VECTOR_MATRIX = None

app = Flask(__name__)
CORS(app)
36
+
37
+ # -----------------------------------------------------------------------------
38
+ # 1. THE PARSER ENGINE (Extracts Structure from PDF)
39
+ # -----------------------------------------------------------------------------
40
+
41
class PDFParser:
    """Heuristically turn a syllabus PDF into a topic / subtopic tree.

    The file is opened with PyMuPDF on construction and subject metadata
    (level, code, name) is derived from the path, which is expected to look
    like ``syllabi/A/Accounting_9706.pdf``.

    The parser can be used as a context manager so the underlying document
    is released when parsing is done::

        with PDFParser(path) as parser:
            data = parser.parse()
    """

    # Headings that start with a (possibly dotted) number such as "1 ", "1. "
    # or "1.1 ", or with the words "Key Question".
    _HEADING_PATTERN = re.compile(
        r'^(\d+(?:\.\d+)*\.?\s|Key Question\s)', re.IGNORECASE
    )

    def __init__(self, filepath):
        """Open *filepath* and derive level, subject code, name and id."""
        self.filepath = filepath
        self.filename = os.path.basename(filepath)
        self.doc = fitz.open(filepath)

        (
            self.level,
            self.subject_code,
            self.subject_name,
            self.unique_id,
        ) = self._identify(filepath)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        """Close the underlying PyMuPDF document."""
        self.doc.close()

    @staticmethod
    def _identify(filepath):
        """Return ``(level, subject_code, subject_name, unique_id)`` for a path.

        - level: name of the folder containing the file, or "General" when
          the path has no parent folder name.
        - subject_code: first run of four digits in the file name, or "0000".
        - subject_name: file name without extension, up to the first "_".
        """
        filename = os.path.basename(filepath)
        level = os.path.basename(os.path.dirname(filepath)) or "General"

        code_match = re.search(r'\d{4}', filename)
        subject_code = code_match.group(0) if code_match else "0000"

        # Drop the extension first so "Physics.pdf" yields "Physics".
        stem = os.path.splitext(filename)[0]
        subject_name = stem.split('_')[0]

        return level, subject_code, subject_name, f"{level}_{subject_code}"

    @staticmethod
    def _summarize_block(block):
        """Return ``(text, max_size, is_bold)`` for one PyMuPDF "dict" block.

        ``text`` is the non-empty span texts joined by single spaces,
        ``max_size`` the largest span size rounded to 0.1pt (0 when the block
        has no text), and ``is_bold`` whether any span's font name contains
        "bold". Sizes are rounded the same way as in
        ``get_font_characteristics`` so both can be compared directly.
        """
        pieces = []
        max_size = 0
        is_bold = False
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                text = span["text"].strip()
                if not text:
                    continue
                pieces.append(text)
                max_size = max(max_size, round(span["size"], 1))
                if "bold" in span["font"].lower():
                    is_bold = True
        return " ".join(pieces), max_size, is_bold

    def get_font_characteristics(self):
        """Return the font size (rounded to 0.1pt) that carries the most text.

        That size is taken to be the body text size. Returns 10.0 when the
        document contains no text spans.
        """
        chars_per_size = {}
        for page in self.doc:
            for block in page.get_text("dict")["blocks"]:
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        size = round(span["size"], 1)
                        chars_per_size[size] = (
                            chars_per_size.get(size, 0) + len(span["text"])
                        )

        if not chars_per_size:
            return 10.0
        return max(chars_per_size, key=chars_per_size.get)

    def parse(self):
        """Build the syllabus tree using font-size heuristics.

        - Text more than 2pt larger than body text starts a new topic.
        - Text at least body size that is bold, or that starts with a
          heading number / "Key Question", starts a new subtopic.
        - Text at most 1pt larger than body text is content of the current
          subtopic (an implicit "Introduction / Overview" subtopic is created
          for content that appears directly under a topic).

        Blocks with fewer than 3 characters of text are ignored.

        Returns:
            dict with ``"meta"`` (id, subject, code, level) and ``"tree"``
            (list of topic dicts, each with a ``"children"`` list of
            subtopic dicts holding a ``"content"`` list of strings).
        """
        body_size = self.get_font_characteristics()
        logger.info(f"Parsing {self.filename} (Body size approx {body_size}pt)")

        syllabus_tree = []
        current_topic = None
        current_subtopic = None

        for page in self.doc:
            for block in page.get_text("dict")["blocks"]:
                block_text, max_size, is_bold = self._summarize_block(block)
                if len(block_text) < 3:
                    continue  # Skip noise

                # HEURISTIC 1: TOPIC (large header, more than 2pt above body)
                if max_size > body_size + 2:
                    # Flush the topic that was being built.
                    if current_subtopic and current_topic:
                        current_topic["children"].append(current_subtopic)
                    if current_topic:
                        syllabus_tree.append(current_topic)

                    current_topic = {
                        "id": f"{self.unique_id}_{len(syllabus_tree)}",
                        "title": block_text,
                        "type": "topic",
                        "children": [],
                    }
                    current_subtopic = None

                # HEURISTIC 2: SUBTOPIC (bold or numbered, at least body size)
                elif max_size >= body_size and (
                    is_bold or self._HEADING_PATTERN.match(block_text)
                ):
                    if current_subtopic and current_topic:
                        current_topic["children"].append(current_subtopic)

                    # A subtopic before any topic gets a placeholder parent.
                    if not current_topic:
                        current_topic = {
                            "id": f"{self.unique_id}_root",
                            "title": "Syllabus Overview",
                            "type": "topic",
                            "children": [],
                        }

                    current_subtopic = {
                        "id": f"{current_topic['id']}_{len(current_topic['children'])}",
                        "title": block_text,
                        "type": "subtopic",
                        "content": [],
                    }

                # HEURISTIC 3: CONTENT (body text)
                elif max_size <= body_size + 1:
                    if current_subtopic:
                        current_subtopic["content"].append(block_text)
                    elif current_topic:
                        # Text directly under a topic: create implicit subtopic.
                        current_subtopic = {
                            "id": f"{current_topic['id']}_intro",
                            "title": "Introduction / Overview",
                            "type": "subtopic",
                            "content": [block_text],
                        }

        # Flush remainders
        if current_subtopic and current_topic:
            current_topic["children"].append(current_subtopic)
        if current_topic:
            syllabus_tree.append(current_topic)

        return {
            "meta": {
                "id": self.unique_id,
                "subject": self.subject_name,
                "code": self.subject_code,
                "level": self.level,
            },
            "tree": syllabus_tree,
        }
172
+
173
+ # -----------------------------------------------------------------------------
174
+ # 2. THE VECTOR ENGINE (Embeddings & Search)
175
+ # -----------------------------------------------------------------------------
176
+
177
def generate_embeddings(texts):
    """Embed *texts* with the Gemini API and return one vector per text.

    Texts are sent in batches of 10. The result always has the same length
    and order as *texts*: when no API key is configured, when a batch
    request fails, or when the API returns a different number of embeddings
    than texts in the batch, every text of that batch gets a zero vector
    instead.

    Args:
        texts: list of strings to embed.

    Returns:
        list of numpy arrays, one per input text.
    """
    # Width of the placeholder vectors.
    # NOTE(review): presumably equal to the embedding model's output width;
    # confirm, since mixed widths cannot be stacked into one matrix.
    fallback_dim = 768

    if not GEMINI_API_KEY:
        logger.warning("No Gemini API Key found. Skipping embeddings.")
        return [np.zeros(fallback_dim) for _ in texts]  # Dummy vectors

    client = genai.Client(api_key=GEMINI_API_KEY)
    results = []

    # Simple batching to avoid hitting limits
    batch_size = 10
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        batch_vectors = None
        try:
            resp = client.models.embed_content(
                model=EMBEDDING_MODEL,
                contents=batch,
            )
            # Collect the whole batch before touching `results`, so a failure
            # part-way through cannot leave a partial batch behind.
            batch_vectors = [np.array(emb.values) for emb in resp.embeddings]
        except Exception as e:
            logger.error(f"Embedding failed: {e}")
        else:
            if len(batch_vectors) != len(batch):
                logger.error(
                    f"Embedding failed: expected {len(batch)} vectors, "
                    f"got {len(batch_vectors)}"
                )
                batch_vectors = None

        if batch_vectors is None:
            # Fallback keeps results aligned one-to-one with texts.
            batch_vectors = [np.zeros(fallback_dim) for _ in batch]
        results.extend(batch_vectors)

    return results
204
+
205
def build_index():
    """Walk SYLLABI_DIR, parse every PDF and (re)build the in-memory index.

    Side effects on module globals:
      - SYLLABUS_MAP gets one entry per parsed PDF, keyed by subject id.
      - VECTOR_DB is replaced by a list of {"vector", "meta"} entries, one
        per subtopic whose joined content is at least 10 characters long.
      - VECTOR_MATRIX is replaced by the row-wise stack of those vectors,
        or None when there is nothing to index.

    A PDF that cannot be opened or parsed is logged and skipped so that one
    bad file does not abort the whole build. Returns None; does nothing
    (apart from logging) when SYLLABI_DIR does not exist.
    """
    global SYLLABUS_MAP, VECTOR_DB, VECTOR_MATRIX

    logger.info("🚀 Starting Build Process...")

    # 1. Walk Directory
    if not os.path.exists(SYLLABI_DIR):
        logger.error(f"Directory {SYLLABI_DIR} not found.")
        return

    parsed_data = []
    seen_ids = set()

    for root, _dirs, files in os.walk(SYLLABI_DIR):
        # Sorted so the build order does not depend on directory listing order.
        for file in sorted(files):
            if not file.lower().endswith(".pdf"):
                continue
            path = os.path.join(root, file)

            try:
                parser = PDFParser(path)
                try:
                    data = parser.parse()
                finally:
                    # Release the PyMuPDF document once parsing is done.
                    parser.doc.close()
            except Exception:
                logger.exception(f"Skipping {path}: could not parse PDF.")
                continue

            subject_id = data["meta"]["id"]
            if subject_id in seen_ids:
                logger.warning(
                    f"Duplicate subject id {subject_id} from {path}; "
                    "overwriting the earlier entry in SYLLABUS_MAP."
                )
            seen_ids.add(subject_id)

            parsed_data.append(data)
            # Store in Map
            SYLLABUS_MAP[subject_id] = data

    # 2. Flatten for Vectorization
    chunks_to_embed = []
    chunk_metadata = []

    for item in parsed_data:
        meta_base = item["meta"]
        for topic in item["tree"]:
            for sub in topic["children"]:
                text_blob = "\n".join(sub["content"])
                if len(text_blob) < 10:
                    continue  # Skip empty chunks

                # Rich semantic chunk:
                # "Subject Level - Topic - Subtopic:\nContent"
                rich_text = f"{meta_base['subject']} {meta_base['level']} - {topic['title']} - {sub['title']}:\n{text_blob}"

                chunks_to_embed.append(rich_text)
                chunk_metadata.append({
                    "subject_id": meta_base["id"],
                    "topic_id": topic["id"],
                    "subtopic_id": sub["id"],
                    "title": sub["title"],
                    "content": text_blob,
                })

    # 3. Generate Embeddings
    logger.info(f"🧮 Generating embeddings for {len(chunks_to_embed)} chunks...")
    vectors = generate_embeddings(chunks_to_embed)
    if len(vectors) != len(chunk_metadata):
        logger.error(
            f"Got {len(vectors)} vectors for {len(chunk_metadata)} chunks; "
            "indexing only the pairs that line up."
        )

    # 4. Populate Global DB
    VECTOR_DB = [
        {"vector": vec, "meta": meta}  # vector kept for debug/individual access
        for vec, meta in zip(vectors, chunk_metadata)
    ]
    # Reset to None when empty so a rebuild never leaves a stale matrix that
    # no longer matches VECTOR_DB.
    if VECTOR_DB:
        VECTOR_MATRIX = np.vstack([entry["vector"] for entry in VECTOR_DB])
    else:
        VECTOR_MATRIX = None

    logger.info("✅ Indexing Complete.")
272
+
273
+ # -----------------------------------------------------------------------------
274
+ # 3. API SERVER (The Retrieval Layer)
275
+ # -----------------------------------------------------------------------------
276
+
277
@app.route('/health', methods=['GET'])
def health():
    """Report that the service is up and list the loaded subject ids."""
    loaded_ids = list(SYLLABUS_MAP)
    return jsonify({"status": "online", "subjects_loaded": loaded_ids})
280
+
281
@app.route('/v1/structure/<subject_id>', methods=['GET'])
def get_structure(subject_id):
    """Return the parsed syllabus (meta + tree) for *subject_id*, or a 404."""
    syllabus = SYLLABUS_MAP.get(subject_id)
    if syllabus:
        return jsonify(syllabus)
    return jsonify({"error": "Subject not found"}), 404
288
+
289
@app.route('/v1/search', methods=['POST'])
def search():
    """
    Semantic retrieval over the indexed syllabus chunks.

    Input (JSON object): { "query": "...", "filter_subject_id": "..." (optional) }

    Responses:
      200 {"results": [...]}  up to 5 hits with cosine score >= 0.3, best
                              first; each hit has score, subject_id, title,
                              content and node_id (the subtopic id).
      400 {"error": ...}      body is not a JSON object or has no usable query.
      500 {"error": ...}      the query could not be embedded.
      503 {"error": ...}      the index has not been built.
    """
    min_score = 0.3   # Threshold cutoff
    max_results = 5   # Limit to top 5

    if VECTOR_MATRIX is None:
        return jsonify({"error": "Index not ready"}), 503

    # silent=True: a missing or malformed JSON body becomes None instead of
    # raising, so it is reported through the same 400 as a missing query.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        payload = {}

    query = payload.get("query")
    subject_filter = payload.get("filter_subject_id")

    if not isinstance(query, str) or not query.strip():
        return jsonify({"error": "Query required"}), 400

    # 1. Embed Query (client creation is inside the try so that a missing or
    #    rejected API key is reported as a JSON error as well)
    try:
        client = genai.Client(api_key=GEMINI_API_KEY)
        resp = client.models.embed_content(model=EMBEDDING_MODEL, contents=query)
        query_vec = np.array(resp.embeddings[0].values).reshape(1, -1)
    except Exception as e:
        logger.error(f"Query embedding failed: {e}")
        return jsonify({"error": str(e)}), 500

    # 2. Vector Search (Cosine Similarity); one score per indexed chunk
    scores = cosine_similarity(query_vec, VECTOR_MATRIX)[0]

    # 3. Filter and Sort: walk all chunks from best to worst score
    results = []
    for idx in np.argsort(scores)[::-1]:
        if scores[idx] < min_score:
            break  # everything after this scores lower still

        meta = VECTOR_DB[idx]["meta"]

        # Apply Filter
        if subject_filter and meta["subject_id"] != subject_filter:
            continue

        results.append({
            "score": float(scores[idx]),
            "subject_id": meta["subject_id"],
            "title": meta["title"],
            "content": meta["content"],  # Raw text chunk
            "node_id": meta["subtopic_id"],  # Pointer to the structure tree
        })

        if len(results) >= max_results:
            break

    return jsonify({"results": results})
345
+
346
+ # -----------------------------------------------------------------------------
347
+ # 4. STARTUP BOOTSTRAP
348
+ # -----------------------------------------------------------------------------
349
+
350
def start_app():
    """Create the syllabus folders if they are missing, then build the index."""
    # In a real deployment, we might load from disk here.
    # For now, we rebuild on boot.
    if not os.path.exists(SYLLABI_DIR):
        for level in ("A", "O"):
            os.makedirs(os.path.join(SYLLABI_DIR, level), exist_ok=True)
        logger.warning(f"Created empty {SYLLABI_DIR}. Please add PDFs.")

    # Run Indexer
    build_index()
360
+
361
# Build the index as soon as the module is imported (or the server starts),
# inside an application context.
with app.app_context():
    start_app()

if __name__ == "__main__":
    # Use 7860 for HF Spaces
    app.run(host="0.0.0.0", port=7860)