Hebaelsayed committed on
Commit
3508eff
·
verified ·
1 Parent(s): 081bf0a

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +534 -404
src/streamlit_app.py CHANGED
@@ -2,10 +2,11 @@ import streamlit as st
2
  import os
3
  import time
4
  import base64
 
5
  from io import BytesIO
6
  from PIL import Image
7
  import PyPDF2
8
- from pdf2image import convert_from_bytes
9
  from anthropic import Anthropic
10
  from qdrant_client import QdrantClient
11
  from qdrant_client.models import Distance, VectorParams, PointStruct
@@ -13,53 +14,103 @@ from sentence_transformers import SentenceTransformer
13
  from huggingface_hub import hf_hub_download, list_repo_files
14
 
15
  # ============================================================================
16
- # MATH AI SYSTEM - READS FROM HF DATASET (PERMANENT STORAGE!)
17
  # ============================================================================
18
 
19
  st.set_page_config(
20
- page_title="Math AI System",
21
  page_icon="🎓",
22
  layout="wide"
23
  )
24
 
25
  COLLECTION_NAME = "math_knowledge_base"
 
26
 
27
- # YOUR DATASET =
28
- DATASET_REPO = "Hebaelsayed/math-ai-documents"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  # ============================================================================
31
  # CACHED RESOURCES
32
  # ============================================================================
33
 
34
  @st.cache_resource
35
- def get_clients():
36
- """Initialize clients"""
37
- qdrant = QdrantClient(
38
  url=os.getenv("QDRANT_URL"),
39
  api_key=os.getenv("QDRANT_API_KEY")
40
  )
41
- claude = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
42
- embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
43
- return qdrant, claude, embedder
 
 
 
 
 
 
 
44
 
45
  # ============================================================================
46
- # DATASET OPERATIONS (Reads from HF Dataset)
47
  # ============================================================================
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  def list_dataset_files(folder_path):
50
- """List all PDF files in a folder from HF Dataset"""
51
  try:
52
- # Get HF token from environment
53
  hf_token = os.getenv("HF_TOKEN")
54
-
55
- # List all files in the dataset
56
  all_files = list_repo_files(
57
  repo_id=DATASET_REPO,
58
  repo_type="dataset",
59
  token=hf_token
60
  )
61
 
62
- # Filter for PDFs in specific folder
63
  pdf_files = [
64
  f for f in all_files
65
  if f.startswith(folder_path) and f.endswith('.pdf')
@@ -71,12 +122,11 @@ def list_dataset_files(folder_path):
71
  st.error(f"Error listing files: {e}")
72
  return []
73
 
74
- def download_file_from_dataset(file_path):
75
- """Download a file from HF Dataset"""
76
  try:
77
  hf_token = os.getenv("HF_TOKEN")
78
 
79
- # Download file
80
  local_path = hf_hub_download(
81
  repo_id=DATASET_REPO,
82
  filename=file_path,
@@ -87,43 +137,39 @@ def download_file_from_dataset(file_path):
87
  return local_path
88
 
89
  except Exception as e:
90
- st.error(f"Error downloading {file_path}: {e}")
91
  return None
92
 
93
- # ============================================================================
94
- # PROCESSING FUNCTIONS
95
- # ============================================================================
96
-
97
  def extract_text_from_pdf(pdf_path):
98
- """Extract text from PDF file"""
99
  try:
100
  with open(pdf_path, 'rb') as file:
101
- pdf_reader = PyPDF2.PdfReader(file)
102
  text = ""
103
- for page_num, page in enumerate(pdf_reader.pages):
104
  text += f"\n\n=== Page {page_num + 1} ===\n\n{page.extract_text()}"
105
  return text
106
  except Exception as e:
107
- st.error(f"PDF extraction error: {e}")
108
  return None
109
 
110
  def pdf_to_images(pdf_path):
111
- """Convert PDF to images"""
112
  try:
113
- from pdf2image import convert_from_path
114
  images = convert_from_path(pdf_path, dpi=200)
115
  return images
116
  except Exception as e:
117
- st.error(f"Conversion error: {e}")
 
118
  return []
119
 
120
  def resize_image(image, max_size=(2048, 2048)):
121
- """Resize for Claude"""
122
  image.thumbnail(max_size, Image.Resampling.LANCZOS)
123
  return image
124
 
125
  def image_to_base64(image):
126
- """Convert to base64"""
127
  buffered = BytesIO()
128
  image.save(buffered, format="PNG")
129
  return base64.b64encode(buffered.getvalue()).decode()
@@ -134,7 +180,7 @@ def ocr_with_claude(claude_client, image, context=""):
134
  resized = resize_image(image.copy())
135
  img_b64 = image_to_base64(resized)
136
 
137
- prompt = f"""Transcribe this handwritten math solution.
138
 
139
  STYLE: Italian cursive (connected letters)
140
  LANGUAGE: English
@@ -145,9 +191,9 @@ INSTRUCTIONS:
145
  1. Transcribe in English
146
  2. Use proper math notation: ∫, ∑, √, ∂, etc.
147
  3. Maintain structure
148
- 4. Mark unclear parts: [unclear: guess]
149
 
150
- OUTPUT: Just the transcription."""
151
 
152
  try:
153
  message = claude_client.messages.create(
@@ -167,10 +213,11 @@ OUTPUT: Just the transcription."""
167
  return message.content[0].text, message.usage.input_tokens + message.usage.output_tokens
168
 
169
  except Exception as e:
 
170
  return None, 0
171
 
172
  def chunk_text(text, chunk_size=150, overlap=30):
173
- """Split into chunks"""
174
  words = text.split()
175
  chunks = []
176
  for i in range(0, len(words), chunk_size - overlap):
@@ -180,7 +227,7 @@ def chunk_text(text, chunk_size=150, overlap=30):
180
  return chunks
181
 
182
  def get_vector_count(qdrant):
183
- """Get total vectors"""
184
  try:
185
  count = 0
186
  offset = None
@@ -203,15 +250,16 @@ def get_vector_count(qdrant):
203
  return 0
204
 
205
  # ============================================================================
206
- # INITIALIZE
207
  # ============================================================================
208
 
209
  try:
210
- qdrant, claude, embedder = get_clients()
 
211
  st.sidebar.success("✅ System Ready")
212
  except Exception as e:
213
- st.error(f"❌ Init failed: {e}")
214
- st.info("Add these in Settings → Secrets: QDRANT_URL, QDRANT_API_KEY, ANTHROPIC_API_KEY, HF_TOKEN")
215
  st.stop()
216
 
217
  # ============================================================================
@@ -219,437 +267,519 @@ except Exception as e:
219
  # ============================================================================
220
 
221
  st.sidebar.title("🎓 Math AI System")
222
-
223
- mode = st.sidebar.radio(
224
- "Mode:",
225
- ["πŸ” Search & Solve", "πŸ—οΈ Process Dataset Files", "πŸ“Š Stats"],
226
- index=0
227
- )
228
-
229
- st.sidebar.markdown("---")
230
 
231
  try:
232
  vector_count = get_vector_count(qdrant)
233
- st.sidebar.metric("Vectors", f"{vector_count:,}")
 
 
 
234
  except:
235
- pass
 
 
236
 
237
  # ============================================================================
238
- # MODE 1: SEARCH & SOLVE
239
  # ============================================================================
240
 
241
- if mode == "πŸ” Search & Solve":
 
 
 
 
 
 
 
 
 
 
242
 
243
- st.title("πŸ” Math Problem Solver")
 
244
 
245
- problem = st.text_area(
246
- "Enter problem:",
247
- placeholder="Find the gradient of L(w) = (1/2)||Xw - y||²",
248
- height=150
249
- )
250
 
251
- top_k = st.slider("Retrieve:", 3, 20, 5)
 
252
 
253
- if st.button("πŸš€ SOLVE", type="primary") and problem:
254
-
255
- with st.spinner("Searching..."):
256
- query_emb = embedder.encode(problem)
257
-
258
- try:
259
- results = qdrant.search(
260
- collection_name=COLLECTION_NAME,
261
- query_vector=query_emb.tolist(),
262
- limit=top_k
263
- )
264
- except:
265
- results = []
266
-
267
- if not results:
268
- st.warning("No context found. Process your files in 'Process Dataset Files' mode.")
269
- else:
270
- st.success(f"Found {len(results)} references!")
271
-
272
- with st.expander("References"):
273
- for i, r in enumerate(results, 1):
274
- st.markdown(f"**{i}.** {r.payload['content'][:200]}...")
275
- st.caption(f"Source: {r.payload.get('source_name')}")
276
 
277
- with st.spinner("Generating solution..."):
 
 
 
278
 
279
- context = "\n\n".join([r.payload['content'] for r in results])
 
280
 
281
- prompt = f"""Solve this problem using the references.
282
-
283
- PROBLEM: {problem}
284
-
285
- REFERENCES: {context}
286
-
287
- FORMAT:
288
- ## SOLUTION
289
- [Step-by-step]
290
-
291
- ## REASONING
292
- [Why this approach]
293
-
294
- ## REFERENCES USED
295
- [Which sources helped]"""
296
-
297
- try:
298
- message = claude.messages.create(
299
- model="claude-sonnet-4-20250514",
300
- max_tokens=4000,
301
- messages=[{"role": "user", "content": prompt}]
302
- )
303
-
304
- st.markdown("---")
305
- st.markdown(message.content[0].text)
306
 
307
- st.download_button(
308
- "πŸ“₯ Download",
309
- message.content[0].text,
310
- file_name=f"solution.md"
 
 
311
  )
312
-
313
- except Exception as e:
314
- st.error(f"Error: {e}")
315
-
316
- # ============================================================================
317
- # MODE 2: PROCESS DATASET FILES
318
- # ============================================================================
319
-
320
- elif mode == "πŸ—οΈ Process Dataset Files":
321
 
322
- st.title("πŸ—οΈ Process Files from HF Dataset")
 
 
 
 
 
 
 
323
 
324
- st.info(f"""
325
- **Dataset:** `{DATASET_REPO}`
326
 
327
- Files are stored permanently in your HF Dataset.
328
- Process them once, search forever!
329
- """)
330
 
331
- # Check if HF token exists
332
- if not os.getenv("HF_TOKEN"):
333
- st.error("⚠️ Missing HF_TOKEN! Add it in Settings β†’ Repository Secrets")
334
- st.info("""
335
- **How to get your HF Token:**
336
- 1. Go to: https://huggingface.co/settings/tokens
337
- 2. Click "New token"
338
- 3. Name: "math-ai-access"
339
- 4. Type: Read
340
- 5. Copy the token
341
- 6. Add as HF_TOKEN in Space Settings β†’ Secrets
342
- """)
343
- st.stop()
344
 
345
- # Create collection if needed
346
- st.header("Step 1: Setup Collection")
 
 
 
347
 
348
- try:
349
- collections = qdrant.get_collections().collections
350
- exists = any(c.name == COLLECTION_NAME for c in collections)
 
351
 
352
- if exists:
353
- st.success(f"βœ… Collection exists")
354
- else:
355
- if st.button("Create Collection"):
356
- qdrant.create_collection(
357
- collection_name=COLLECTION_NAME,
358
- vectors_config=VectorParams(size=384, distance=Distance.COSINE)
359
- )
360
- st.success("Created!")
361
- st.rerun()
362
- except Exception as e:
363
- st.error(f"Error: {e}")
 
 
364
 
365
  st.markdown("---")
366
 
367
- # Process files
368
- st.header("Step 2: Process Files")
369
 
370
- tab1, tab2, tab3 = st.tabs(["πŸ“š Books", "πŸ“ Exams", "πŸ–ŠοΈ Handwritten Answers"])
 
 
 
371
 
372
  # ========================================================================
373
- # BOOKS
374
  # ========================================================================
375
 
376
- with tab1:
377
- st.subheader("Process Books (Typed PDFs)")
378
 
379
- if st.button("πŸ“š List Books in Dataset"):
380
- book_files = list_dataset_files("books/")
381
-
382
- if book_files:
383
- st.write(f"Found {len(book_files)} books:")
384
- for f in book_files:
385
- st.text(f"β€’ {f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
386
 
387
- st.session_state.book_files = book_files
388
- else:
389
- st.warning("No books found in dataset/books/ folder")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
390
 
391
- if 'book_files' in st.session_state and st.button("πŸš€ Process All Books"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
392
 
393
- for book_file in st.session_state.book_files:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
 
395
- with st.expander(f"Processing {book_file}"):
 
 
 
 
 
 
396
 
397
- try:
398
- # Download
399
- st.write("πŸ“₯ Downloading...")
400
- local_path = download_file_from_dataset(book_file)
401
-
402
- if not local_path:
403
- continue
404
-
405
- # Extract
406
- st.write("πŸ“– Extracting text...")
407
- text = extract_text_from_pdf(local_path)
408
-
409
- if not text:
410
- continue
411
-
412
- st.write(f"βœ… {len(text):,} chars")
413
-
414
- # Chunk
415
- chunks = chunk_text(text)
416
- st.write(f"βœ‚οΈ {len(chunks)} chunks")
417
-
418
- # Embed
419
- embeddings = embedder.encode(chunks, show_progress_bar=False)
420
-
421
- # Upload
422
- points = []
423
- for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
424
- points.append(PointStruct(
425
- id=abs(hash(f"{book_file}_{i}_{time.time()}")) % (2**63),
426
- vector=emb.tolist(),
427
- payload={
428
- "content": chunk,
429
- "source_name": book_file.split('/')[-1],
430
- "source_type": "book",
431
- "chunk_index": i
432
- }
433
- ))
434
 
435
- qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
436
- st.success(f"βœ… Uploaded {len(points)} vectors!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
437
 
438
- except Exception as e:
439
- st.error(f"Error: {e}")
 
440
 
441
  # ========================================================================
442
- # EXAMS
443
  # ========================================================================
444
 
445
- with tab2:
446
- st.subheader("Process Exams (Typed PDFs)")
447
 
448
- if st.button("πŸ“ List Exams in Dataset"):
449
- exam_files = list_dataset_files("exams/")
450
-
451
- if exam_files:
452
- st.write(f"Found {len(exam_files)} exams:")
453
- for f in exam_files:
454
- st.text(f"β€’ {f}")
455
-
456
- st.session_state.exam_files = exam_files
457
- else:
458
- st.warning("No exams found")
459
 
460
- if 'exam_files' in st.session_state and st.button("πŸš€ Process All Exams"):
461
-
462
- for exam_file in st.session_state.exam_files:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
463
 
464
- with st.expander(f"Processing {exam_file}"):
 
465
 
466
- try:
467
- local_path = download_file_from_dataset(exam_file)
468
- text = extract_text_from_pdf(local_path)
 
 
 
 
 
469
 
470
- if not text:
471
- continue
 
 
472
 
473
- st.write(f"βœ… {len(text):,} chars")
 
 
 
474
 
475
- chunks = chunk_text(text)
476
- embeddings = embedder.encode(chunks, show_progress_bar=False)
 
 
 
477
 
 
478
  points = []
479
- for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
480
  points.append(PointStruct(
481
- id=abs(hash(f"{exam_file}_{i}_{time.time()}")) % (2**63),
482
  vector=emb.tolist(),
483
  payload={
484
- "content": chunk,
485
- "source_name": exam_file.split('/')[-1],
486
- "source_type": "exam",
487
- "chunk_index": i
 
488
  }
489
  ))
490
 
491
  qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
492
  st.success(f"βœ… Uploaded {len(points)} vectors!")
493
-
494
- except Exception as e:
495
- st.error(f"Error: {e}")
496
-
497
- # ========================================================================
498
- # HANDWRITTEN ANSWERS (AI OCR)
499
- # ========================================================================
500
-
501
- with tab3:
502
- st.subheader("Process Handwritten Answers (AI OCR)")
503
-
504
- st.warning("⚠️ This uses Claude Vision - costs ~$0.05-0.10 per PDF page")
505
-
506
- if st.button("πŸ–ŠοΈ List Answer Files"):
507
- answer_files = list_dataset_files("answers/")
508
-
509
- if answer_files:
510
- st.write(f"Found {len(answer_files)} answer files:")
511
- for f in answer_files:
512
- st.text(f"β€’ {f}")
513
 
514
- st.session_state.answer_files = answer_files
515
- else:
516
- st.warning("No answers found")
517
-
518
- if 'answer_files' in st.session_state:
519
-
520
- # Get context from books if available
521
- context_books = ""
522
- try:
523
- book_samples = qdrant.scroll(
524
- collection_name=COLLECTION_NAME,
525
- limit=5,
526
- with_payload=True,
527
- with_vectors=False,
528
- scroll_filter={"must": [{"key": "source_type", "match": {"value": "book"}}]}
529
- )
530
-
531
- if book_samples and book_samples[0]:
532
- context_books = "\n".join([p.payload['content'] for p in book_samples[0]])
533
- st.info("βœ… Using book context for better OCR")
534
- except:
535
- st.caption("No books processed yet - OCR will work but may be less accurate")
536
-
537
- if st.button("πŸ€– PROCESS WITH AI OCR", type="primary"):
538
-
539
- total_tokens = 0
540
-
541
- for answer_file in st.session_state.answer_files:
542
-
543
- with st.expander(f"Processing {answer_file}"):
544
-
545
- try:
546
- # Download
547
- local_path = download_file_from_dataset(answer_file)
548
-
549
- # Convert to images
550
- st.write("πŸ–ΌοΈ Converting to images...")
551
- images = pdf_to_images(local_path)
552
-
553
- if not images:
554
- continue
555
-
556
- st.write(f"βœ… {len(images)} pages")
557
-
558
- # OCR each page
559
- transcribed_pages = []
560
- page_tokens = 0
561
-
562
- for page_num, image in enumerate(images, 1):
563
- st.write(f"πŸ€– OCR Page {page_num}/{len(images)}...")
564
-
565
- transcription, tokens = ocr_with_claude(
566
- claude,
567
- image,
568
- context=context_books
569
- )
570
-
571
- if transcription:
572
- transcribed_pages.append(f"\n=== Page {page_num} ===\n\n{transcription}")
573
- page_tokens += tokens
574
-
575
- if not transcribed_pages:
576
- st.error("OCR failed")
577
- continue
578
-
579
- full_text = "\n\n".join(transcribed_pages)
580
- st.success(f"βœ… Transcribed {len(full_text):,} chars")
581
- st.info(f"Tokens: {page_tokens:,} (~${page_tokens * 0.000003:.3f})")
582
- total_tokens += page_tokens
583
-
584
- # Chunk
585
- chunks = chunk_text(full_text)
586
- embeddings = embedder.encode(chunks, show_progress_bar=False)
587
-
588
- # Upload
589
- points = []
590
- for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
591
- points.append(PointStruct(
592
- id=abs(hash(f"{answer_file}_{i}_{time.time()}")) % (2**63),
593
- vector=emb.tolist(),
594
- payload={
595
- "content": chunk,
596
- "source_name": answer_file.split('/')[-1],
597
- "source_type": "answer_handwritten",
598
- "chunk_index": i,
599
- "ocr_tokens": page_tokens
600
- }
601
- ))
602
-
603
- qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
604
- st.success(f"βœ… Uploaded {len(points)} vectors!")
605
-
606
- except Exception as e:
607
- st.error(f"Error: {e}")
608
-
609
- st.success(f"Total tokens: {total_tokens:,} | Cost: ${total_tokens * 0.000003:.2f}")
610
 
611
  # ============================================================================
612
- # MODE 3: STATS
613
  # ============================================================================
614
 
615
- elif mode == "πŸ“Š Stats":
616
 
617
- st.title("πŸ“Š Database Statistics")
618
 
619
- try:
620
- sample = qdrant.scroll(
621
- collection_name=COLLECTION_NAME,
622
- limit=1000,
623
- with_payload=True,
624
- with_vectors=False
 
 
 
 
 
 
 
 
 
 
625
  )
 
 
626
 
627
- if sample and sample[0]:
628
- types = {}
629
- sources = set()
630
-
631
- for point in sample[0]:
632
- src_type = point.payload.get('source_type', 'unknown')
633
- types[src_type] = types.get(src_type, 0) + 1
634
- sources.add(point.payload.get('source_name', 'Unknown'))
635
-
636
- col1, col2 = st.columns(2)
637
-
638
- with col1:
639
- st.metric("Total Vectors", get_vector_count(qdrant))
640
 
641
- with col2:
642
- st.metric("Unique Sources", len(sources))
 
 
 
 
 
 
 
 
 
 
 
643
 
644
- st.subheader("By Type")
645
- for doc_type, count in sorted(types.items()):
646
- st.progress(count / sum(types.values()), text=f"{doc_type}: {count}")
 
 
647
 
648
- st.subheader("Sources")
649
- for src in sorted(sources):
650
- st.caption(f"β€’ {src}")
651
-
652
- except Exception as e:
653
- st.error(f"Error: {e}")
654
-
655
- st.sidebar.caption("πŸŽ“ Math AI v1.0")
 
2
  import os
3
  import time
4
  import base64
5
+ import hashlib
6
  from io import BytesIO
7
  from PIL import Image
8
  import PyPDF2
9
+ from pdf2image import convert_from_path
10
  from anthropic import Anthropic
11
  from qdrant_client import QdrantClient
12
  from qdrant_client.models import Distance, VectorParams, PointStruct
 
14
  from huggingface_hub import hf_hub_download, list_repo_files
15
 
16
  # ============================================================================
17
+ # PRODUCTION MATH AI SYSTEM - SMART PROCESSING
18
  # ============================================================================
19
 
20
  st.set_page_config(
21
+ page_title="Math AI System - Production",
22
  page_icon="🎓",
23
  layout="wide"
24
  )
25
 
26
  COLLECTION_NAME = "math_knowledge_base"
27
+ DATASET_REPO = "yourusername/math-ai-documents" # ← CHANGE THIS!
28
 
29
+ # ============================================================================
30
+ # AVAILABLE EMBEDDING MODELS
31
+ # ============================================================================
32
+
33
+ EMBEDDING_MODELS = {
34
+ "MiniLM-L6 (Fast, 384D)": {
35
+ "name": "sentence-transformers/all-MiniLM-L6-v2",
36
+ "dimensions": 384,
37
+ "speed": "Fast",
38
+ "quality": "Good"
39
+ },
40
+ "MiniLM-L12 (Balanced, 384D)": {
41
+ "name": "sentence-transformers/all-MiniLM-L12-v2",
42
+ "dimensions": 384,
43
+ "speed": "Medium",
44
+ "quality": "Better"
45
+ },
46
+ "MPNet (Best Quality, 768D)": {
47
+ "name": "sentence-transformers/all-mpnet-base-v2",
48
+ "dimensions": 768,
49
+ "speed": "Slower",
50
+ "quality": "Excellent"
51
+ }
52
+ }
53
 
54
  # ============================================================================
55
  # CACHED RESOURCES
56
  # ============================================================================
57
 
58
  @st.cache_resource
59
+ def get_qdrant_client():
60
+ """Initialize Qdrant client"""
61
+ return QdrantClient(
62
  url=os.getenv("QDRANT_URL"),
63
  api_key=os.getenv("QDRANT_API_KEY")
64
  )
65
+
66
+ @st.cache_resource
67
+ def get_claude_client():
68
+ """Initialize Claude client"""
69
+ return Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
70
+
71
+ @st.cache_resource
72
+ def get_embedding_model(model_name):
73
+ """Load embedding model (cached per model)"""
74
+ return SentenceTransformer(model_name)
75
 
76
  # ============================================================================
77
+ # HELPER FUNCTIONS
78
  # ============================================================================
79
 
80
+ def get_file_hash(file_path):
81
+ """Generate unique hash for file to track if already processed"""
82
+ return hashlib.md5(file_path.encode()).hexdigest()
83
+
84
+ def check_if_processed(qdrant, file_name):
85
+ """Check if file already processed in Qdrant"""
86
+ try:
87
+ results = qdrant.scroll(
88
+ collection_name=COLLECTION_NAME,
89
+ scroll_filter={
90
+ "must": [
91
+ {"key": "source_name", "match": {"value": file_name}}
92
+ ]
93
+ },
94
+ limit=1,
95
+ with_payload=True,
96
+ with_vectors=False
97
+ )
98
+
99
+ return len(results[0]) > 0 if results and results[0] else False
100
+
101
+ except:
102
+ return False
103
+
104
  def list_dataset_files(folder_path):
105
+ """List PDF files in HF Dataset folder"""
106
  try:
 
107
  hf_token = os.getenv("HF_TOKEN")
 
 
108
  all_files = list_repo_files(
109
  repo_id=DATASET_REPO,
110
  repo_type="dataset",
111
  token=hf_token
112
  )
113
 
 
114
  pdf_files = [
115
  f for f in all_files
116
  if f.startswith(folder_path) and f.endswith('.pdf')
 
122
  st.error(f"Error listing files: {e}")
123
  return []
124
 
125
+ def download_from_dataset(file_path):
126
+ """Download file from HF Dataset"""
127
  try:
128
  hf_token = os.getenv("HF_TOKEN")
129
 
 
130
  local_path = hf_hub_download(
131
  repo_id=DATASET_REPO,
132
  filename=file_path,
 
137
  return local_path
138
 
139
  except Exception as e:
140
+ st.error(f"Download error: {e}")
141
  return None
142
 
 
 
 
 
143
  def extract_text_from_pdf(pdf_path):
144
+ """Extract text from typed PDF"""
145
  try:
146
  with open(pdf_path, 'rb') as file:
147
+ reader = PyPDF2.PdfReader(file)
148
  text = ""
149
+ for page_num, page in enumerate(reader.pages):
150
  text += f"\n\n=== Page {page_num + 1} ===\n\n{page.extract_text()}"
151
  return text
152
  except Exception as e:
153
+ st.error(f"Text extraction error: {e}")
154
  return None
155
 
156
  def pdf_to_images(pdf_path):
157
+ """Convert PDF to images for OCR"""
158
  try:
 
159
  images = convert_from_path(pdf_path, dpi=200)
160
  return images
161
  except Exception as e:
162
+ st.error(f"PDF to image error: {e}")
163
+ st.info("πŸ’‘ This requires poppler-utils. Add 'poppler-utils' to packages.txt file in your Space")
164
  return []
165
 
166
  def resize_image(image, max_size=(2048, 2048)):
167
+ """Resize image for Claude Vision"""
168
  image.thumbnail(max_size, Image.Resampling.LANCZOS)
169
  return image
170
 
171
  def image_to_base64(image):
172
+ """Convert PIL Image to base64"""
173
  buffered = BytesIO()
174
  image.save(buffered, format="PNG")
175
  return base64.b64encode(buffered.getvalue()).decode()
 
180
  resized = resize_image(image.copy())
181
  img_b64 = image_to_base64(resized)
182
 
183
+ prompt = f"""Transcribe handwritten math solution.
184
 
185
  STYLE: Italian cursive (connected letters)
186
  LANGUAGE: English
 
191
  1. Transcribe in English
192
  2. Use proper math notation: ∫, ∑, √, ∂, etc.
193
  3. Maintain structure
194
+ 4. Mark unclear: [unclear: guess]
195
 
196
+ OUTPUT: Transcription only."""
197
 
198
  try:
199
  message = claude_client.messages.create(
 
213
  return message.content[0].text, message.usage.input_tokens + message.usage.output_tokens
214
 
215
  except Exception as e:
216
+ st.error(f"OCR error: {e}")
217
  return None, 0
218
 
219
  def chunk_text(text, chunk_size=150, overlap=30):
220
+ """Split text into chunks"""
221
  words = text.split()
222
  chunks = []
223
  for i in range(0, len(words), chunk_size - overlap):
 
227
  return chunks
228
 
229
  def get_vector_count(qdrant):
230
+ """Get total vectors in database"""
231
  try:
232
  count = 0
233
  offset = None
 
250
  return 0
251
 
252
  # ============================================================================
253
+ # INITIALIZE CLIENTS
254
  # ============================================================================
255
 
256
  try:
257
+ qdrant = get_qdrant_client()
258
+ claude = get_claude_client()
259
  st.sidebar.success("✅ System Ready")
260
  except Exception as e:
261
+ st.error(f"❌ Initialization failed: {e}")
262
+ st.info("Add these secrets: QDRANT_URL, QDRANT_API_KEY, ANTHROPIC_API_KEY, HF_TOKEN")
263
  st.stop()
264
 
265
  # ============================================================================
 
267
  # ============================================================================
268
 
269
  st.sidebar.title("🎓 Math AI System")
270
+ st.sidebar.caption("Production Version")
 
 
 
 
 
 
 
271
 
272
  try:
273
  vector_count = get_vector_count(qdrant)
274
+ st.sidebar.metric("Total Vectors", f"{vector_count:,}")
275
+
276
+ storage_mb = (vector_count * 384 * 4) / (1024 * 1024)
277
+ st.sidebar.metric("Storage", f"{storage_mb:.1f} MB")
278
  except:
279
+ st.sidebar.warning("Database unavailable")
280
+
281
+ st.sidebar.markdown("---")
282
 
283
  # ============================================================================
284
+ # MAIN TABS (Reordered as requested)
285
  # ============================================================================
286
 
287
+ tab1, tab2, tab3 = st.tabs([
288
+ "πŸ“Š Dataset Manager",
289
+ "πŸ” Search & Solve",
290
+ "πŸ“ˆ Statistics"
291
+ ])
292
+
293
+ # ============================================================================
294
+ # TAB 1: DATASET MANAGER (Primary Interface)
295
+ # ============================================================================
296
+
297
+ with tab1:
298
 
299
+ st.title("πŸ“Š Dataset Manager")
300
+ st.markdown("*Manage all your data sources in one place*")
301
 
302
+ # Check HF Token
303
+ if not os.getenv("HF_TOKEN"):
304
+ st.error("⚠️ Missing HF_TOKEN in secrets!")
305
+ st.info("Add it in Settings → Repository Secrets")
306
+ st.stop()
307
 
308
+ # Collection setup
309
+ st.header("πŸ—οΈ Step 1: Database Setup")
310
 
311
+ col1, col2 = st.columns([2, 1])
312
+
313
+ with col1:
314
+ try:
315
+ collections = qdrant.get_collections().collections
316
+ exists = any(c.name == COLLECTION_NAME for c in collections)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
 
318
+ if exists:
319
+ st.success(f"✅ Collection '{COLLECTION_NAME}' exists")
320
+ else:
321
+ st.warning(f"Collection '{COLLECTION_NAME}' doesn't exist")
322
 
323
+ # Show embedding model choice for initial creation
324
+ st.subheader("Choose Embedding Model")
325
 
326
+ for model_name, specs in EMBEDDING_MODELS.items():
327
+ with st.expander(f"{model_name} - {specs['quality']} quality, {specs['speed']} speed"):
328
+ st.write(f"**Dimensions:** {specs['dimensions']}")
329
+ st.write(f"**Model:** `{specs['name']}`")
330
+
331
+ selected_model_key = st.selectbox(
332
+ "Select embedding model:",
333
+ list(EMBEDDING_MODELS.keys())
334
+ )
335
+
336
+ if st.button("πŸ—οΈ Create Collection", type="primary"):
337
+ dimensions = EMBEDDING_MODELS[selected_model_key]["dimensions"]
 
 
 
 
 
 
 
 
 
 
 
 
 
338
 
339
+ qdrant.create_collection(
340
+ collection_name=COLLECTION_NAME,
341
+ vectors_config=VectorParams(
342
+ size=dimensions,
343
+ distance=Distance.COSINE
344
+ )
345
  )
346
+
347
+ st.success(f"✅ Created with {dimensions}D vectors!")
348
+ st.session_state.embedding_model = EMBEDDING_MODELS[selected_model_key]["name"]
349
+ st.rerun()
350
+
351
+ except Exception as e:
352
+ st.error(f"Error: {e}")
 
 
353
 
354
+ with col2:
355
+ st.info(f"""
356
+ **Dataset:**
357
+ `{DATASET_REPO}`
358
+
359
+ **Collection:**
360
+ `{COLLECTION_NAME}`
361
+ """)
362
 
363
+ st.markdown("---")
 
364
 
365
+ # Processing options
366
+ st.header("βš™οΈ Step 2: Processing Configuration")
 
367
 
368
+ col1, col2, col3 = st.columns(3)
 
 
 
 
 
 
 
 
 
 
 
 
369
 
370
+ with col1:
371
+ st.subheader("Chunking Strategy")
372
+ chunk_size = st.slider("Chunk size (words):", 50, 500, 150)
373
+ chunk_overlap = st.slider("Overlap (words):", 0, 100, 30)
374
+ st.caption(f"Overlap: {(chunk_overlap/chunk_size*100):.0f}%")
375
 
376
+ with col2:
377
+ st.subheader("Embedding Model")
378
+ # Get current model from collection or use default
379
+ current_model = st.session_state.get('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"])
380
 
381
+ # Find which key this model belongs to
382
+ current_model_key = "MiniLM-L6 (Fast, 384D)"
383
+ for key, specs in EMBEDDING_MODELS.items():
384
+ if specs["name"] == current_model:
385
+ current_model_key = key
386
+ break
387
+
388
+ st.info(f"**Active:** {current_model_key}")
389
+ st.caption(f"Model: `{current_model}`")
390
+
391
+ with col3:
392
+ st.subheader("OCR Settings")
393
+ use_context_for_ocr = st.checkbox("Use book context", value=True, help="Better accuracy, higher cost")
394
+ st.caption("Context helps Claude understand symbols")
395
 
396
  st.markdown("---")
397
 
398
+ # Data sources
399
+ st.header("πŸ“ Step 3: Data Sources")
400
 
401
+ source_tabs = st.tabs([
402
+ "πŸ“‚ Your Dataset Files",
403
+ "🌐 Public Datasets (GSM8K, MATH, etc.)"
404
+ ])
405
 
406
  # ========================================================================
407
+ # SOURCE 1: HF Dataset Files
408
  # ========================================================================
409
 
410
with source_tabs[0]:
    # Tab 1 of the data sources: scan a folder of the HF dataset, let the user
    # pick unprocessed files, then extract/OCR, chunk, embed and upsert them.

    # Imported locally so this block does not depend on the file's import header.
    import hashlib

    def _report_failure(failures, file_name, message):
        """Show *message* now and remember it for the summary shown after the rerun."""
        st.error(message)
        failures.append(f"{file_name}: {message}")

    st.subheader("Files from Your HF Dataset")

    # The processing step ends with st.rerun(), which discards everything
    # drawn during that run. The outcome is therefore stored in session state
    # and rendered here, on the run that follows.
    last_run = st.session_state.pop('last_ingest_summary', None)
    if last_run is not None:
        if last_run['files_ok']:
            st.balloons()
        st.success(
            "\n"
            "πŸŽ‰ Processing Complete!\n"
            "\n"
            f"- Files processed: {last_run['files_ok']}\n"
            f"- Vectors added: {last_run['vectors']:,}\n"
            f"- OCR tokens used: {last_run['tokens']:,}\n"
            f"- OCR cost: ${last_run['tokens'] * 0.000003:.2f}\n"
        )
        for failure in last_run['failures']:
            st.error(failure)

    folder_type = st.radio(
        "Select folder:",
        ["πŸ“š Books (Typed PDFs)", "πŸ“ Exams (Typed PDFs)", "πŸ–ŠοΈ Answers (Handwritten - needs OCR)"],
        horizontal=True
    )

    # Determine folder path and the source_type stored with every vector
    if "Books" in folder_type:
        folder_path = "books/"
        doc_type = "book"
    elif "Exams" in folder_type:
        folder_path = "exams/"
        doc_type = "exam"
    else:
        folder_path = "answers/"
        doc_type = "answer_handwritten"

    # List files
    if st.button(f"πŸ” Scan {folder_path} folder"):
        with st.spinner("Scanning dataset..."):
            files = list_dataset_files(folder_path)

            if files:
                # Check processing status for each file
                file_status = []
                for remote_path in files:
                    file_name = remote_path.split('/')[-1]
                    file_status.append({
                        "file": remote_path,
                        "name": file_name,
                        "processed": check_if_processed(qdrant, file_name)
                    })

                # Remember the scan so it survives Streamlit reruns
                st.session_state.current_files = file_status
                st.session_state.current_folder = folder_path
                st.session_state.current_doc_type = doc_type
            else:
                st.warning(f"No files found in {folder_path}")

    # Display files with status (only for the folder that was actually scanned)
    if 'current_files' in st.session_state and st.session_state.current_folder == folder_path:

        st.write(f"**Found {len(st.session_state.current_files)} files:**")

        # Summary
        processed_count = sum(1 for f in st.session_state.current_files if f['processed'])
        pending_count = len(st.session_state.current_files) - processed_count

        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Total", len(st.session_state.current_files))
        with col2:
            st.metric("βœ… Processed", processed_count)
        with col3:
            st.metric("⏳ Pending", pending_count)

        # File list with checkboxes
        st.subheader("Select files to process:")

        selected_files = []

        for file_info in st.session_state.current_files:
            col1, col2 = st.columns([3, 1])

            with col1:
                # Only allow selection if not processed
                if file_info['processed']:
                    st.checkbox(
                        f"βœ… {file_info['name']} (Already processed)",
                        value=False,
                        disabled=True,
                        key=f"file_{file_info['name']}"
                    )
                elif st.checkbox(
                    f"⏳ {file_info['name']}",
                    value=True,  # Auto-select pending files
                    key=f"file_{file_info['name']}"
                ):
                    selected_files.append(file_info)

            with col2:
                st.caption("Skip" if file_info['processed'] else "Ready")

        # Process button
        if selected_files:

            st.markdown("---")
            st.write(f"**Ready to process {len(selected_files)} file(s)**")

            # Show cost estimate for OCR
            if doc_type == "answer_handwritten":
                est_pages = len(selected_files) * 5  # Assume 5 pages per PDF
                est_cost = est_pages * 0.08
                # Dollar signs are escaped: two bare "$" in one Markdown string
                # can be parsed by Streamlit as an inline LaTeX span.
                st.warning(f"⚠️ OCR Cost Estimate: ~\\${est_cost:.2f} ({est_pages} pages Γ— ~\\$0.08/page)")

            if st.button("πŸš€ PROCESS SELECTED FILES", type="primary"):

                # Load embedding model
                embedder = get_embedding_model(current_model)

                # Get context if needed: a few book chunks that are handed to the OCR call
                context_books = ""
                if doc_type == "answer_handwritten" and use_context_for_ocr:
                    try:
                        book_samples = qdrant.scroll(
                            collection_name=COLLECTION_NAME,
                            limit=10,
                            with_payload=True,
                            with_vectors=False,
                            scroll_filter={"must": [{"key": "source_type", "match": {"value": "book"}}]}
                        )
                    except Exception as e:
                        # Best effort: OCR still runs without context, but the
                        # real cause is reported instead of being hidden.
                        st.caption(f"Could not load book context ({e}) - OCR will work but may be less accurate")
                    else:
                        snippets = []
                        if book_samples and book_samples[0]:
                            snippets = [
                                (p.payload or {}).get('content', '')
                                for p in book_samples[0][:5]
                            ]
                            snippets = [s for s in snippets if s]
                        if snippets:
                            context_books = "\n".join(snippets)
                            st.info("βœ… Using book context for OCR")
                        else:
                            st.caption("No books in database - OCR will work but may be less accurate")

                # Process each selected file
                total_tokens = 0
                total_vectors = 0
                files_ok = 0     # files whose vectors were actually uploaded
                failures = []    # "<file name>: <message>" for every file that was skipped

                for file_info in selected_files:

                    with st.expander(f"Processing {file_info['name']}", expanded=True):

                        try:
                            # Download
                            st.write("πŸ“₯ Downloading...")
                            local_path = download_from_dataset(file_info['file'])

                            if not local_path:
                                _report_failure(failures, file_info['name'], "Download failed")
                                continue

                            # Extract or OCR
                            if doc_type == "answer_handwritten":
                                # OCR path
                                st.write("πŸ–ΌοΈ Converting to images...")
                                images = pdf_to_images(local_path)

                                if not images:
                                    _report_failure(
                                        failures,
                                        file_info['name'],
                                        "Conversion failed - poppler-utils not installed?"
                                    )
                                    continue

                                st.write(f"βœ… {len(images)} pages")

                                # OCR each page
                                transcribed_pages = []
                                page_tokens = 0

                                for page_num, image in enumerate(images, 1):
                                    st.write(f"πŸ€– OCR page {page_num}/{len(images)}...")

                                    transcription, tokens = ocr_with_claude(
                                        claude,
                                        image,
                                        context=context_books
                                    )

                                    if transcription:
                                        transcribed_pages.append(f"\n=== Page {page_num} ===\n\n{transcription}")
                                        page_tokens += tokens

                                if not transcribed_pages:
                                    _report_failure(failures, file_info['name'], "OCR failed")
                                    continue

                                text = "\n\n".join(transcribed_pages)
                                total_tokens += page_tokens

                                st.success(f"βœ… Transcribed {len(text):,} chars (${page_tokens * 0.000003:.3f})")

                            else:
                                # Text extraction
                                st.write("πŸ“– Extracting text...")
                                text = extract_text_from_pdf(local_path)

                                if not text:
                                    _report_failure(failures, file_info['name'], "Text extraction failed")
                                    continue

                                st.write(f"βœ… {len(text):,} chars")

                            # Chunk
                            chunks = chunk_text(text, chunk_size, chunk_overlap)
                            st.write(f"βœ‚οΈ {len(chunks)} chunks")

                            if len(chunks) == 0:
                                # Nothing to embed; skip instead of sending an empty upsert.
                                _report_failure(failures, file_info['name'], "No chunks produced")
                                continue

                            # Embed
                            st.write("πŸ”’ Embedding...")
                            embeddings = embedder.encode(chunks, show_progress_bar=False)

                            # Upload. The id is a stable 63-bit integer derived
                            # from the file path and chunk index, so running the
                            # same file again overwrites its points instead of
                            # adding duplicates. (The built-in hash() of a str is
                            # salted per process and cannot provide that.)
                            points = [
                                PointStruct(
                                    id=int.from_bytes(
                                        hashlib.sha256(
                                            f"{file_info['file']}_{i}".encode("utf-8")
                                        ).digest()[:8],
                                        "big"
                                    ) % (2**63),
                                    vector=emb.tolist(),
                                    payload={
                                        "content": chunk,
                                        "source_name": file_info['name'],
                                        "source_type": doc_type,
                                        "chunk_index": i,
                                        "embedding_model": current_model
                                    }
                                )
                                for i, (chunk, emb) in enumerate(zip(chunks, embeddings))
                            ]

                            qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
                            total_vectors += len(points)
                            files_ok += 1

                            st.success(f"βœ… Uploaded {len(points)} vectors!")

                        except Exception as e:
                            _report_failure(failures, file_info['name'], f"Error: {e}")

                # Keep the outcome for the next run; it is rendered at the top
                # of this tab because st.rerun() wipes the current output.
                st.session_state['last_ingest_summary'] = {
                    "files_ok": files_ok,
                    "vectors": total_vectors,
                    "tokens": total_tokens,
                    "failures": failures,
                }

                # Clear selection
                st.session_state.pop('current_files', None)
                st.rerun()
648
 
649
  # ========================================================================
650
+ # SOURCE 2: Public Datasets
651
  # ========================================================================
652
 
653
with source_tabs[1]:
    # Tab 2 of the data sources: sample problems from a public Hugging Face
    # math dataset, embed them and upsert them into the collection.

    # Imported locally so this block does not depend on the file's import header.
    import hashlib

    st.subheader("Public Math Datasets")

    dataset_choice = st.selectbox(
        "Select dataset:",
        [
            "GSM8K - Grade School Math (8.5K problems)",
            "MATH - Competition Math (12.5K problems)",
            "MathQA - Math Word Problems (37K problems)"
        ]
    )

    sample_size = st.slider("Number of samples:", 10, 2000, 100)

    # Check if already loaded; the short name before " - " is what gets
    # stored as source_name on every vector.
    dataset_name = dataset_choice.split(" - ")[0]
    already_loaded = check_if_processed(qdrant, dataset_name)

    if already_loaded:
        st.success(f"βœ… {dataset_name} already loaded!")
        st.info("Vectors from this dataset are already in your database.")
    elif st.button(f"πŸ“₯ Load {dataset_name}", type="primary"):

        try:
            from datasets import load_dataset

            embedder = get_embedding_model(current_model)

            with st.spinner(f"Loading {dataset_name}..."):

                # NOTE(review): trust_remote_code=True lets the dataset
                # repository run its own loading code on this machine. It is
                # kept because removing it could break loading; confirm it is
                # still required for each repository.
                if dataset_name == "GSM8K":
                    dataset = load_dataset("openai/gsm8k", "main", split="train", trust_remote_code=True)
                    rows = [dataset[i] for i in range(min(sample_size, len(dataset)))]
                    texts = [
                        f"Problem: {row['question']}\n\nSolution: {row['answer']}"
                        for row in rows
                    ]

                elif dataset_name == "MATH":
                    dataset = load_dataset("lighteval/MATH", split="train", trust_remote_code=True)
                    rows = [dataset[i] for i in range(min(sample_size, len(dataset)))]
                    texts = [
                        f"Problem: {row.get('problem', '')}\n\nSolution: {row.get('solution', '')}"
                        for row in rows
                    ]

                else:  # MathQA
                    dataset = load_dataset("allenai/math_qa", split="train", trust_remote_code=True)
                    rows = [dataset[i] for i in range(min(sample_size, len(dataset)))]
                    texts = [
                        f"Problem: {row['Problem']}\n\nAnswer: {row['correct']}"
                        for row in rows
                    ]

            st.write(f"βœ… Loaded {len(texts)} problems")

            # Embed
            st.write("πŸ”’ Embedding...")
            embeddings = embedder.encode(texts, show_progress_bar=True)

            # Upload. The id is a stable 63-bit integer derived from the
            # dataset name and row index, so loading the same dataset again
            # overwrites its points instead of adding duplicates. (The
            # built-in hash() of a str is salted per process.)
            points = [
                PointStruct(
                    id=int.from_bytes(
                        hashlib.sha256(f"{dataset_name}_{i}".encode("utf-8")).digest()[:8],
                        "big"
                    ) % (2**63),
                    vector=emb.tolist(),
                    payload={
                        "content": text[:2000],  # stored text is capped; the full text was embedded
                        "source_name": dataset_name,
                        "source_type": "public_dataset",
                        "index": i,
                        "embedding_model": current_model
                    }
                )
                for i, (text, emb) in enumerate(zip(texts, embeddings))
            ]

            qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
            st.success(f"βœ… Uploaded {len(points)} vectors!")
            st.balloons()

        except Exception as e:
            st.error(f"Error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
727
 
728
  # ============================================================================
729
+ # TAB 2: SEARCH & SOLVE
730
  # ============================================================================
731
 
732
+ with tab2:
733
 
734
+ st.title("πŸ” Search & Solve")
735
 
736
+ problem = st.text_area(
737
+ "Enter math problem:",
738
+ placeholder="Find the gradient of the loss function L(w) = (1/2)||Xw - y||Β²",
739
+ height=150
740
+ )
741
+
742
+ col1, col2 = st.columns(2)
743
+
744
+ with col1:
745
+ top_k = st.slider("Retrieve top:", 3, 20, 5)
746
+
747
+ with col2:
748
+ detail = st.select_slider(
749
+ "Detail level:",
750
+ ["Concise", "Standard", "Detailed", "Exhaustive"],
751
+ value="Detailed"
752
  )
753
+
754
+ if st.button("πŸš€ SOLVE", type="primary") and problem:
755
 
756
+ # Get embedding model
757
+ current_model = st.session_state.get('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"])
758
+ embedder = get_embedding_model(current_model)
759
+
760
+ with st.spinner("Searching..."):
761
+ query_emb = embedder.encode(problem)
 
 
 
 
 
 
 
762
 
763
+ try:
764
+ results = qdrant.search(
765
+ collection_name=COLLECTION_NAME,
766
+ query_vector=query_emb.tolist(),
767
+ limit=top_k
768
+ )
769
+ except:
770
+ results = []
771
+
772
+ if not results:
773
+ st.warning("No results. Load data in Dataset Manager.")
774
+ else:
775
+ st.success(f"Found {len(results)} references!")
776
 
777
+ with st.expander("πŸ“š References"):
778
+ for i, r in enumerate(results, 1):
779
+ st.markdown(f"**{i}.** ({r.score*100:.0f}% match)")
780
+ st.text(r.payload['content'][:200] + "...")
781
+ st.caption(f"Source: {r.payload.get('source_name')}")
782
 
783
+ with st.spinner("Generating solution..."):
784
+
785
+ context = "\n\n".join([r