Hebaelsayed committed on
Commit
d9f0bf7
·
verified ·
1 Parent(s): 06c1259

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +367 -458
src/streamlit_app.py CHANGED
@@ -10,107 +10,144 @@ from anthropic import Anthropic
10
  from qdrant_client import QdrantClient
11
  from qdrant_client.models import Distance, VectorParams, PointStruct
12
  from sentence_transformers import SentenceTransformer
 
13
 
14
  # ============================================================================
15
- # COMPLETE MATH AI SYSTEM - 100% HUGGING FACE
16
  # ============================================================================
17
 
18
  st.set_page_config(
19
  page_title="Math AI System",
20
  page_icon="πŸŽ“",
21
- layout="wide",
22
- initial_sidebar_state="expanded"
23
  )
24
 
25
  COLLECTION_NAME = "math_knowledge_base"
26
 
 
 
 
27
  # ============================================================================
28
  # CACHED RESOURCES
29
  # ============================================================================
30
 
31
  @st.cache_resource
32
  def get_clients():
33
- """Initialize all clients - cached"""
34
  qdrant = QdrantClient(
35
  url=os.getenv("QDRANT_URL"),
36
  api_key=os.getenv("QDRANT_API_KEY")
37
  )
38
-
39
  claude = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
40
-
41
  embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
42
-
43
  return qdrant, claude, embedder
44
 
45
  # ============================================================================
46
- # HELPER FUNCTIONS
47
  # ============================================================================
48
 
49
- def extract_text_from_pdf(pdf_file):
50
- """Extract text from typed PDF"""
51
  try:
52
- pdf_reader = PyPDF2.PdfReader(pdf_file)
53
- text = ""
54
- for page_num, page in enumerate(pdf_reader.pages):
55
- text += f"\n\n=== Page {page_num + 1} ===\n\n{page.extract_text()}"
56
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  except Exception as e:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  return None
59
 
60
- def pdf_to_images(pdf_bytes):
61
- """Convert PDF pages to images for OCR"""
 
 
 
 
62
  try:
63
- images = convert_from_bytes(pdf_bytes.read(), dpi=200)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  return images
65
  except Exception as e:
66
- st.error(f"PDF to image conversion error: {e}")
67
  return []
68
 
69
  def resize_image(image, max_size=(2048, 2048)):
70
- """Resize image for Claude Vision"""
71
  image.thumbnail(max_size, Image.Resampling.LANCZOS)
72
  return image
73
 
74
  def image_to_base64(image):
75
- """Convert PIL Image to base64"""
76
  buffered = BytesIO()
77
  image.save(buffered, format="PNG")
78
  return base64.b64encode(buffered.getvalue()).decode()
79
 
80
- def ocr_with_claude(claude_client, image, context_books="", context_exam=""):
81
- """
82
- AI-powered OCR for handwritten Italian cursive math notes
83
-
84
- NOTE: Italian cursive is the HANDWRITING STYLE (connected letters)
85
- Language is ENGLISH
86
- """
87
 
88
  resized = resize_image(image.copy())
89
  img_b64 = image_to_base64(resized)
90
 
91
- prompt = f"""You are an expert in transcribing handwritten mathematical solutions.
92
-
93
- IMPORTANT: This is written in ITALIAN CURSIVE style (connected, flowing letters), but the LANGUAGE IS ENGLISH.
94
 
95
- CONTEXT FROM TEXTBOOKS (helps understand symbols):
96
- {context_books[:2000] if context_books else "No context available"}
97
 
98
- EXAM QUESTION (helps understand what's being solved):
99
- {context_exam[:1000] if context_exam else "No exam question available"}
100
-
101
- TASK: Transcribe this handwritten math solution into clean, readable text.
102
 
103
  INSTRUCTIONS:
104
- 1. Language is ENGLISH (just cursive style is Italian)
105
- 2. Convert math notation properly:
106
- - Use standard symbols: ∫, ∑, √, ∂, lim, etc.
107
- - Use LaTeX for complex formulas
108
- - Preserve Greek letters: α, β, γ, π, etc.
109
- 3. Maintain structure (paragraphs, steps)
110
- 4. If unclear, mark as [unclear: best guess]
111
- 5. Describe diagrams as [DIAGRAM: description]
112
 
113
- OUTPUT: Just the transcribed text, no preamble."""
114
 
115
  try:
116
  message = claude_client.messages.create(
@@ -120,34 +157,20 @@ OUTPUT: Just the transcribed text, no preamble."""
120
  {
121
  "role": "user",
122
  "content": [
123
- {
124
- "type": "image",
125
- "source": {
126
- "type": "base64",
127
- "media_type": "image/png",
128
- "data": img_b64
129
- }
130
- },
131
- {
132
- "type": "text",
133
- "text": prompt
134
- }
135
  ]
136
  }
137
  ]
138
  )
139
 
140
- transcription = message.content[0].text
141
- tokens = message.usage.input_tokens + message.usage.output_tokens
142
-
143
- return transcription, tokens
144
 
145
  except Exception as e:
146
- st.error(f"OCR error: {e}")
147
  return None, 0
148
 
149
  def chunk_text(text, chunk_size=150, overlap=30):
150
- """Split text into chunks"""
151
  words = text.split()
152
  chunks = []
153
  for i in range(0, len(words), chunk_size - overlap):
@@ -157,7 +180,7 @@ def chunk_text(text, chunk_size=150, overlap=30):
157
  return chunks
158
 
159
  def get_vector_count(qdrant):
160
- """Get total vectors in database"""
161
  try:
162
  count = 0
163
  offset = None
@@ -187,8 +210,8 @@ try:
187
  qdrant, claude, embedder = get_clients()
188
  st.sidebar.success("βœ… System Ready")
189
  except Exception as e:
190
- st.error(f"❌ Initialization failed: {e}")
191
- st.info("Add QDRANT_URL, QDRANT_API_KEY, and ANTHROPIC_API_KEY in Settings β†’ Secrets")
192
  st.stop()
193
 
194
  # ============================================================================
@@ -198,21 +221,18 @@ except Exception as e:
198
  st.sidebar.title("πŸŽ“ Math AI System")
199
 
200
  mode = st.sidebar.radio(
201
- "Select Mode:",
202
- ["πŸ” Search & Solve", "πŸ—οΈ Setup Database", "πŸ§ͺ Testing Dashboard"],
203
  index=0
204
  )
205
 
206
  st.sidebar.markdown("---")
207
 
208
- # Database stats
209
  try:
210
  vector_count = get_vector_count(qdrant)
211
- st.sidebar.metric("Vectors in DB", f"{vector_count:,}")
212
- storage_mb = (vector_count * 384 * 4) / (1024 * 1024)
213
- st.sidebar.metric("Storage Used", f"{storage_mb:.1f} MB")
214
  except:
215
- st.sidebar.warning("Database not accessible")
216
 
217
  # ============================================================================
218
  # MODE 1: SEARCH & SOLVE
@@ -221,125 +241,58 @@ except:
221
  if mode == "πŸ” Search & Solve":
222
 
223
  st.title("πŸ” Math Problem Solver")
224
- st.markdown("*Search your knowledge base and get detailed solutions*")
225
-
226
- # Input
227
- st.header("πŸ“ Input Problem")
228
 
229
- input_method = st.radio(
230
- "How to input:",
231
- ["✍️ Type Question", "πŸ“„ Upload Exam PDF"],
232
- horizontal=True
233
  )
234
 
235
- problem = None
236
-
237
- if input_method == "✍️ Type Question":
238
- problem = st.text_area(
239
- "Enter math problem:",
240
- placeholder="Example: Find the gradient of L(w) = (1/2)||Xw - y||Β²",
241
- height=150
242
- )
243
- else:
244
- uploaded_exam = st.file_uploader("Upload exam PDF:", type=['pdf'])
245
- if uploaded_exam:
246
- exam_text = extract_text_from_pdf(uploaded_exam)
247
- if exam_text:
248
- st.text_area("Extracted:", exam_text[:1000], height=200)
249
- problem = st.text_input("Specific question or use full text")
250
-
251
- # Settings
252
- with st.expander("βš™οΈ Advanced Settings"):
253
- col1, col2 = st.columns(2)
254
-
255
- with col1:
256
- search_filter = st.multiselect(
257
- "Search in:",
258
- ["Books", "Exams", "Handwritten Solutions", "Public Datasets"],
259
- default=["Books", "Exams", "Handwritten Solutions"]
260
- )
261
-
262
- with col2:
263
- top_k = st.slider("Retrieve top:", 3, 20, 5)
264
- detail_level = st.select_slider(
265
- "Detail level:",
266
- ["Concise", "Standard", "Detailed", "Very Detailed"],
267
- value="Detailed"
268
- )
269
 
270
- # Solve
271
- if st.button("πŸš€ SOLVE PROBLEM", type="primary") and problem:
272
 
273
- with st.spinner("πŸ” Searching..."):
274
-
275
- query_embedding = embedder.encode(problem)
276
 
277
  try:
278
  results = qdrant.search(
279
  collection_name=COLLECTION_NAME,
280
- query_vector=query_embedding.tolist(),
281
  limit=top_k
282
  )
283
- except Exception as e:
284
- st.error(f"Search failed: {e}")
285
  results = []
286
 
287
  if not results:
288
- st.warning("No relevant context found. Load data in Setup mode.")
289
  else:
290
- st.success(f"βœ… Found {len(results)} references!")
291
 
292
- # Show context
293
- with st.expander("πŸ“š Retrieved References"):
294
- for i, result in enumerate(results, 1):
295
- similarity = result.score * 100
296
- st.markdown(f"**Reference {i}** ({similarity:.1f}% relevant)")
297
- st.info(result.payload['content'][:300] + "...")
298
- st.caption(f"Source: {result.payload.get('source_name', 'Unknown')}")
299
- st.markdown("---")
300
 
301
- # Generate solution
302
- with st.spinner("πŸ€– Generating solution..."):
303
 
304
- context = "\n\n".join([
305
- f"[Reference {i+1} from {r.payload.get('source_name')}]:\n{r.payload['content']}"
306
- for i, r in enumerate(results)
307
- ])
308
 
309
- detail_instructions = {
310
- "Concise": "Brief solution, key steps only.",
311
- "Standard": "Clear solution with main steps.",
312
- "Detailed": "Comprehensive solution with detailed explanations.",
313
- "Very Detailed": "Exhaustive solution with all steps and intuitions."
314
- }
315
-
316
- prompt = f"""You are an expert mathematics tutor for machine learning.
317
-
318
- PROBLEM:
319
- {problem}
320
-
321
- REFERENCES (from student's materials):
322
- {context}
323
 
324
- TASK: Solve providing a complete educational solution.
325
 
326
- {detail_instructions[detail_level]}
327
 
328
  FORMAT:
329
-
330
  ## SOLUTION
331
- [Step-by-step solution with clear notation]
332
 
333
- ## REASONING & APPROACH
334
- [WHY this approach, what concepts, how references helped]
335
 
336
  ## REFERENCES USED
337
- [Which references used and HOW each contributed]
338
-
339
- ## VERIFICATION
340
- [How to verify the solution]
341
-
342
- Use proper notation (LaTeX if needed). Reference the materials when explaining."""
343
 
344
  try:
345
  message = claude.messages.create(
@@ -348,116 +301,133 @@ Use proper notation (LaTeX if needed). Reference the materials when explaining."
348
  messages=[{"role": "user", "content": prompt}]
349
  )
350
 
351
- solution = message.content[0].text
352
-
353
  st.markdown("---")
354
- st.markdown(solution)
355
 
356
  st.download_button(
357
- "πŸ“₯ Download Solution",
358
- solution,
359
- file_name=f"solution_{int(time.time())}.md",
360
- mime="text/markdown"
361
  )
362
-
363
- with st.expander("πŸ“Š API Usage"):
364
- st.json({
365
- "input_tokens": message.usage.input_tokens,
366
- "output_tokens": message.usage.output_tokens,
367
- "cost": f"${(message.usage.input_tokens * 0.000003 + message.usage.output_tokens * 0.000015):.4f}"
368
- })
369
 
370
  except Exception as e:
371
  st.error(f"Error: {e}")
372
 
373
  # ============================================================================
374
- # MODE 2: SETUP DATABASE
375
  # ============================================================================
376
 
377
- elif mode == "πŸ—οΈ Setup Database":
378
 
379
- st.title("πŸ—οΈ Database Setup")
380
- st.markdown("*Upload and process your documents*")
381
 
382
- # Create collection
383
- st.header("Step 1: Create Collection")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
 
385
  try:
386
  collections = qdrant.get_collections().collections
387
  exists = any(c.name == COLLECTION_NAME for c in collections)
388
 
389
  if exists:
390
- st.success(f"βœ… Collection '{COLLECTION_NAME}' exists")
391
  else:
392
- if st.button("πŸ—οΈ Create Collection"):
393
  qdrant.create_collection(
394
  collection_name=COLLECTION_NAME,
395
  vectors_config=VectorParams(size=384, distance=Distance.COSINE)
396
  )
397
- st.success("βœ… Created!")
398
  st.rerun()
399
  except Exception as e:
400
  st.error(f"Error: {e}")
401
 
402
  st.markdown("---")
403
 
404
- # Upload documents
405
- st.header("Step 2: Upload Documents")
406
 
407
- tab1, tab2, tab3 = st.tabs([
408
- "πŸ“š Books & Exams (Typed PDFs)",
409
- "πŸ–ŠοΈ Handwritten Solutions (OCR)",
410
- "πŸ“Š Public Datasets"
411
- ])
412
 
413
  # ========================================================================
414
- # TAB 1: Typed PDFs
415
  # ========================================================================
416
 
417
  with tab1:
418
- st.info("βœ… Upload your typed PDFs (books, exams) here")
419
-
420
- uploaded_files = st.file_uploader(
421
- "Choose PDF files:",
422
- type=['pdf'],
423
- accept_multiple_files=True,
424
- key="typed_pdfs"
425
- )
426
 
427
- doc_type = st.selectbox("Document type:", ["book", "exam", "reference"])
 
 
 
 
 
 
 
 
 
 
428
 
429
- if uploaded_files and st.button("πŸ“€ Process & Upload", key="upload_typed"):
430
 
431
- for uploaded_file in uploaded_files:
432
- with st.expander(f"Processing {uploaded_file.name}"):
 
433
 
434
  try:
 
 
 
 
 
 
 
435
  # Extract
436
- text = extract_text_from_pdf(uploaded_file)
 
 
437
  if not text:
438
- st.error("Text extraction failed")
439
  continue
440
 
441
- st.write(f"βœ… Extracted {len(text):,} chars")
442
 
443
  # Chunk
444
  chunks = chunk_text(text)
445
- st.write(f"βœ… Created {len(chunks)} chunks")
446
 
447
  # Embed
448
- with st.spinner("Embedding..."):
449
- embeddings = embedder.encode(chunks, show_progress_bar=False)
450
 
451
  # Upload
452
  points = []
453
  for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
454
  points.append(PointStruct(
455
- id=abs(hash(f"{uploaded_file.name}_{i}_{time.time()}")) % (2**63),
456
  vector=emb.tolist(),
457
  payload={
458
  "content": chunk,
459
- "source_name": uploaded_file.name,
460
- "source_type": doc_type,
461
  "chunk_index": i
462
  }
463
  ))
@@ -469,278 +439,217 @@ elif mode == "πŸ—οΈ Setup Database":
469
  st.error(f"Error: {e}")
470
 
471
  # ========================================================================
472
- # TAB 2: Handwritten OCR (100% IN HF SPACES!)
473
  # ========================================================================
474
 
475
  with tab2:
476
- st.success("βœ… AI-POWERED OCR - Process handwritten notes RIGHT HERE!")
477
 
478
- st.markdown("""
479
- ### How it works:
480
- 1. Upload handwritten solution PDFs (from your Google Drive)
481
- 2. AI OCR processes each page with Claude Vision
482
- 3. Uses your books/exams as context for better accuracy
483
- 4. Uploads transcribed text to database
 
 
 
 
 
484
 
485
- **Cost:** ~$0.05-0.10 per handwritten PDF page
486
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
487
 
488
- # Upload handwritten PDFs
489
- handwritten_files = st.file_uploader(
490
- "Upload handwritten solution PDFs:",
491
- type=['pdf'],
492
- accept_multiple_files=True,
493
- key="handwritten_pdfs",
494
- help="Your answer PDFs from Google Drive/Math_AI_Documents/answers/"
495
- )
496
 
497
- # Optional: Context from books
498
- context_books = ""
499
- use_context = st.checkbox("Use book context for better OCR accuracy", value=True)
 
 
 
 
 
 
 
 
500
 
501
- if use_context:
502
- # Get some book context from database
 
 
503
  try:
504
  book_samples = qdrant.scroll(
505
  collection_name=COLLECTION_NAME,
506
- limit=10,
507
  with_payload=True,
508
  with_vectors=False,
509
  scroll_filter={"must": [{"key": "source_type", "match": {"value": "book"}}]}
510
  )
511
 
512
  if book_samples and book_samples[0]:
513
- context_books = "\n".join([p.payload['content'] for p in book_samples[0][:5]])
514
- st.caption(f"βœ… Using {len(book_samples[0])} book excerpts as context")
515
  except:
516
- st.caption("⚠️ No books in database yet. OCR will work but may be less accurate.")
517
-
518
- if handwritten_files and st.button("πŸ€– PROCESS WITH AI OCR", type="primary"):
519
-
520
- total_tokens = 0
521
 
522
- for uploaded_file in handwritten_files:
523
- st.markdown(f"### Processing: {uploaded_file.name}")
524
 
525
- try:
526
- # Convert PDF to images
527
- with st.spinner("Converting PDF to images..."):
528
- # Read bytes
529
- pdf_bytes = BytesIO(uploaded_file.read())
530
- images = pdf_to_images(pdf_bytes)
531
-
532
- if not images:
533
- st.error("PDF conversion failed")
534
- continue
535
-
536
- st.write(f"βœ… Converted to {len(images)} pages")
537
-
538
- # OCR each page
539
- transcribed_pages = []
540
- page_tokens = 0
541
 
542
- for page_num, image in enumerate(images, 1):
543
- with st.spinner(f"OCR Page {page_num}/{len(images)}..."):
 
 
 
544
 
545
- transcription, tokens = ocr_with_claude(
546
- claude,
547
- image,
548
- context_books=context_books,
549
- context_exam=""
550
- )
551
 
552
- if transcription:
553
- transcribed_pages.append(f"\n=== Page {page_num} ===\n\n{transcription}")
554
- page_tokens += tokens
555
- st.write(f" βœ… Page {page_num} ({tokens:,} tokens)")
556
- else:
557
- st.write(f" ❌ Page {page_num} failed")
558
-
559
- if not transcribed_pages:
560
- st.error("No pages transcribed successfully")
561
- continue
562
-
563
- # Combine all pages
564
- full_text = "\n\n".join(transcribed_pages)
565
- st.success(f"βœ… Transcribed {len(full_text):,} characters")
566
- st.info(f"πŸ“Š Tokens used: {page_tokens:,} (~${page_tokens * 0.000003:.3f})")
567
- total_tokens += page_tokens
568
-
569
- # Show preview
570
- with st.expander("πŸ‘οΈ Preview transcription"):
571
- st.text(full_text[:500] + "...")
572
-
573
- # Chunk
574
- chunks = chunk_text(full_text)
575
- st.write(f"βœ… Created {len(chunks)} chunks")
576
-
577
- # Embed
578
- with st.spinner("Embedding..."):
579
- embeddings = embedder.encode(chunks, show_progress_bar=False)
580
-
581
- # Upload
582
- points = []
583
- for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
584
- points.append(PointStruct(
585
- id=abs(hash(f"handwritten_{uploaded_file.name}_{i}_{time.time()}")) % (2**63),
586
- vector=emb.tolist(),
587
- payload={
588
- "content": chunk,
589
- "source_name": uploaded_file.name,
590
- "source_type": "answer_handwritten",
591
- "chunk_index": i,
592
- "handwriting_style": "italian_cursive",
593
- "language": "english",
594
- "ocr_method": "claude_vision",
595
- "tokens_used": page_tokens
596
- }
597
- ))
598
-
599
- qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
600
- st.success(f"πŸŽ‰ Uploaded {len(points)} vectors from handwritten notes!")
601
- st.balloons()
602
-
603
- except Exception as e:
604
- st.error(f"Error: {e}")
605
- st.exception(e)
606
-
607
- st.markdown("---")
608
- st.success(f"βœ… Total tokens used: {total_tokens:,}")
609
- st.info(f"πŸ’° Estimated total cost: ${total_tokens * 0.000003:.2f}")
610
-
611
- # ========================================================================
612
- # TAB 3: Public Datasets
613
- # ========================================================================
614
-
615
- with tab3:
616
- st.info("Load pre-built math datasets")
617
-
618
- dataset_choice = st.selectbox(
619
- "Choose dataset:",
620
- ["GSM8K - Grade School Math",
621
- "MATH - Competition Math",
622
- "MathQA - Word Problems"]
623
- )
624
-
625
- sample_size = st.slider("Samples:", 10, 1000, 100)
626
-
627
- if st.button("πŸ“₯ Load Dataset"):
628
- try:
629
- from datasets import load_dataset
630
 
631
- with st.spinner(f"Loading..."):
632
-
633
- if "GSM8K" in dataset_choice:
634
- dataset = load_dataset("openai/gsm8k", "main", split="train", trust_remote_code=True)
635
- texts = [f"Problem: {dataset[i]['question']}\n\nSolution: {dataset[i]['answer']}"
636
- for i in range(min(sample_size, len(dataset)))]
637
- name = "GSM8K"
638
-
639
- elif "MATH" in dataset_choice:
640
- dataset = load_dataset("lighteval/MATH", split="train", trust_remote_code=True)
641
- texts = [f"Problem: {dataset[i].get('problem', '')}\n\nSolution: {dataset[i].get('solution', '')}"
642
- for i in range(min(sample_size, len(dataset)))]
643
- name = "MATH"
644
-
645
- else:
646
- dataset = load_dataset("allenai/math_qa", split="train", trust_remote_code=True)
647
- texts = [f"Problem: {dataset[i]['Problem']}\n\nAnswer: {dataset[i]['correct']}"
648
- for i in range(min(sample_size, len(dataset)))]
649
- name = "MathQA"
650
-
651
- st.write(f"βœ… Loaded {len(texts)} problems")
652
-
653
- # Embed
654
- embeddings = embedder.encode(texts, show_progress_bar=True)
655
-
656
- # Upload
657
- points = []
658
- for i, (text, emb) in enumerate(zip(texts, embeddings)):
659
- points.append(PointStruct(
660
- id=abs(hash(f"{name}_{i}_{time.time()}")) % (2**63),
661
- vector=emb.tolist(),
662
- payload={
663
- "content": text[:2000],
664
- "source_name": name,
665
- "source_type": "public_dataset",
666
- "index": i
667
- }
668
- ))
669
-
670
- qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
671
- st.success(f"βœ… Uploaded {len(points)} vectors!")
672
- st.balloons()
673
-
674
- except Exception as e:
675
- st.error(f"Error: {e}")
676
 
677
  # ============================================================================
678
- # MODE 3: TESTING
679
  # ============================================================================
680
 
681
- elif mode == "πŸ§ͺ Testing Dashboard":
682
-
683
- st.title("πŸ§ͺ Testing Dashboard")
684
 
685
- tab1, tab2 = st.tabs(["πŸ“Š Stats", "🎯 Accuracy"])
686
 
687
- with tab1:
688
- st.header("Database Statistics")
 
 
 
 
 
689
 
690
- try:
691
- sample = qdrant.scroll(
692
- collection_name=COLLECTION_NAME,
693
- limit=1000,
694
- with_payload=True,
695
- with_vectors=False
696
- )
697
 
698
- if sample and sample[0]:
699
- types = {}
700
- sources = set()
701
-
702
- for point in sample[0]:
703
- src_type = point.payload.get('source_type', 'unknown')
704
- types[src_type] = types.get(src_type, 0) + 1
705
- sources.add(point.payload.get('source_name', 'Unknown'))
706
-
707
- col1, col2, col3 = st.columns(3)
708
-
709
- with col1:
710
- st.metric("Total Vectors", get_vector_count(qdrant))
711
-
712
- with col2:
713
- st.metric("Sources", len(sources))
714
-
715
- with col3:
716
- st.metric("Types", len(types))
717
-
718
- st.subheader("By Type")
719
- for doc_type, count in sorted(types.items()):
720
- st.progress(count / sum(types.values()), text=f"{doc_type}: {count}")
721
-
722
- except Exception as e:
723
- st.error(f"Error: {e}")
724
-
725
- with tab2:
726
- st.header("Test Accuracy")
727
-
728
- test_query = st.text_input("Test query:")
729
-
730
- if st.button("Test") and test_query:
731
- query_emb = embedder.encode(test_query)
732
- results = qdrant.search(
733
- collection_name=COLLECTION_NAME,
734
- query_vector=query_emb.tolist(),
735
- limit=5
736
- )
737
 
738
- for i, r in enumerate(results, 1):
739
- similarity = r.score * 100
740
- quality = "🟒" if similarity > 70 else "🟑" if similarity > 50 else "πŸ”΄"
741
- st.markdown(f"**{i}. {quality}** ({similarity:.1f}%)")
742
- st.text(r.payload['content'][:200] + "...")
743
- st.markdown("---")
 
 
 
 
744
 
745
- st.sidebar.markdown("---")
746
  st.sidebar.caption("πŸŽ“ Math AI v1.0")
 
10
  from qdrant_client import QdrantClient
11
  from qdrant_client.models import Distance, VectorParams, PointStruct
12
  from sentence_transformers import SentenceTransformer
13
+ from huggingface_hub import hf_hub_download, list_repo_files
14
 
15
  # ============================================================================
16
+ # MATH AI SYSTEM - READS FROM HF DATASET (PERMANENT STORAGE!)
17
  # ============================================================================
18
 
19
  st.set_page_config(
20
  page_title="Math AI System",
21
  page_icon="🎓",
22
+ layout="wide"
 
23
  )
24
 
25
  COLLECTION_NAME = "math_knowledge_base"
26
 
27
+ # YOUR DATASET - Change this to your dataset name!
28
+ DATASET_REPO = "YOUR_USERNAME/math-ai-documents" # ← EDIT THIS!
29
+
30
  # ============================================================================
31
  # CACHED RESOURCES
32
  # ============================================================================
33
 
34
  @st.cache_resource
35
  def get_clients():
36
+ """Initialize clients"""
37
  qdrant = QdrantClient(
38
  url=os.getenv("QDRANT_URL"),
39
  api_key=os.getenv("QDRANT_API_KEY")
40
  )
 
41
  claude = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
 
42
  embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
 
43
  return qdrant, claude, embedder
44
 
45
  # ============================================================================
46
+ # DATASET OPERATIONS (Reads from HF Dataset)
47
  # ============================================================================
48
 
49
+ def list_dataset_files(folder_path):
50
+ """List all PDF files in a folder from HF Dataset"""
51
  try:
52
+ # Get HF token from environment
53
+ hf_token = os.getenv("HF_TOKEN")
54
+
55
+ # List all files in the dataset
56
+ all_files = list_repo_files(
57
+ repo_id=DATASET_REPO,
58
+ repo_type="dataset",
59
+ token=hf_token
60
+ )
61
+
62
+ # Filter for PDFs in specific folder
63
+ pdf_files = [
64
+ f for f in all_files
65
+ if f.startswith(folder_path) and f.endswith('.pdf')
66
+ ]
67
+
68
+ return pdf_files
69
+
70
  except Exception as e:
71
+ st.error(f"Error listing files: {e}")
72
+ return []
73
+
74
+ def download_file_from_dataset(file_path):
75
+ """Download a file from HF Dataset"""
76
+ try:
77
+ hf_token = os.getenv("HF_TOKEN")
78
+
79
+ # Download file
80
+ local_path = hf_hub_download(
81
+ repo_id=DATASET_REPO,
82
+ filename=file_path,
83
+ repo_type="dataset",
84
+ token=hf_token
85
+ )
86
+
87
+ return local_path
88
+
89
+ except Exception as e:
90
+ st.error(f"Error downloading {file_path}: {e}")
91
  return None
92
 
93
+ # ============================================================================
94
+ # PROCESSING FUNCTIONS
95
+ # ============================================================================
96
+
97
+ def extract_text_from_pdf(pdf_path):
98
+ """Extract text from PDF file"""
99
  try:
100
+ with open(pdf_path, 'rb') as file:
101
+ pdf_reader = PyPDF2.PdfReader(file)
102
+ text = ""
103
+ for page_num, page in enumerate(pdf_reader.pages):
104
+ text += f"\n\n=== Page {page_num + 1} ===\n\n{page.extract_text()}"
105
+ return text
106
+ except Exception as e:
107
+ st.error(f"PDF extraction error: {e}")
108
+ return None
109
+
110
+ def pdf_to_images(pdf_path):
111
+ """Convert PDF to images"""
112
+ try:
113
+ from pdf2image import convert_from_path
114
+ images = convert_from_path(pdf_path, dpi=200)
115
  return images
116
  except Exception as e:
117
+ st.error(f"Conversion error: {e}")
118
  return []
119
 
120
  def resize_image(image, max_size=(2048, 2048)):
121
+ """Resize for Claude"""
122
  image.thumbnail(max_size, Image.Resampling.LANCZOS)
123
  return image
124
 
125
  def image_to_base64(image):
126
+ """Convert to base64"""
127
  buffered = BytesIO()
128
  image.save(buffered, format="PNG")
129
  return base64.b64encode(buffered.getvalue()).decode()
130
 
131
+ def ocr_with_claude(claude_client, image, context=""):
132
+ """AI OCR with Claude Vision"""
 
 
 
 
 
133
 
134
  resized = resize_image(image.copy())
135
  img_b64 = image_to_base64(resized)
136
 
137
+ prompt = f"""Transcribe this handwritten math solution.
 
 
138
 
139
+ STYLE: Italian cursive (connected letters)
140
+ LANGUAGE: English
141
 
142
+ CONTEXT: {context[:2000] if context else ""}
 
 
 
143
 
144
  INSTRUCTIONS:
145
+ 1. Transcribe in English
146
+ 2. Use proper math notation: ∫, ∑, √, ∂, etc.
147
+ 3. Maintain structure
148
+ 4. Mark unclear parts: [unclear: guess]
 
 
 
 
149
 
150
+ OUTPUT: Just the transcription."""
151
 
152
  try:
153
  message = claude_client.messages.create(
 
157
  {
158
  "role": "user",
159
  "content": [
160
+ {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": img_b64}},
161
+ {"type": "text", "text": prompt}
 
 
 
 
 
 
 
 
 
 
162
  ]
163
  }
164
  ]
165
  )
166
 
167
+ return message.content[0].text, message.usage.input_tokens + message.usage.output_tokens
 
 
 
168
 
169
  except Exception as e:
 
170
  return None, 0
171
 
172
  def chunk_text(text, chunk_size=150, overlap=30):
173
+ """Split into chunks"""
174
  words = text.split()
175
  chunks = []
176
  for i in range(0, len(words), chunk_size - overlap):
 
180
  return chunks
181
 
182
  def get_vector_count(qdrant):
183
+ """Get total vectors"""
184
  try:
185
  count = 0
186
  offset = None
 
210
  qdrant, claude, embedder = get_clients()
211
  st.sidebar.success("✅ System Ready")
212
  except Exception as e:
213
+ st.error(f"❌ Init failed: {e}")
214
+ st.info("Add these in Settings → Secrets: QDRANT_URL, QDRANT_API_KEY, ANTHROPIC_API_KEY, HF_TOKEN")
215
  st.stop()
216
 
217
  # ============================================================================
 
221
  st.sidebar.title("🎓 Math AI System")
222
 
223
  mode = st.sidebar.radio(
224
+ "Mode:",
225
+ ["🔍 Search & Solve", "🏗️ Process Dataset Files", "📊 Stats"],
226
  index=0
227
  )
228
 
229
  st.sidebar.markdown("---")
230
 
 
231
  try:
232
  vector_count = get_vector_count(qdrant)
233
+ st.sidebar.metric("Vectors", f"{vector_count:,}")
 
 
234
  except:
235
+ pass
236
 
237
  # ============================================================================
238
  # MODE 1: SEARCH & SOLVE
 
241
  if mode == "🔍 Search & Solve":
242
 
243
  st.title("🔍 Math Problem Solver")
 
 
 
 
244
 
245
+ problem = st.text_area(
246
+ "Enter problem:",
247
+ placeholder="Find the gradient of L(w) = (1/2)||Xw - y||²",
248
+ height=150
249
  )
250
 
251
+ top_k = st.slider("Retrieve:", 3, 20, 5)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
 
253
+ if st.button("🚀 SOLVE", type="primary") and problem:
 
254
 
255
+ with st.spinner("Searching..."):
256
+ query_emb = embedder.encode(problem)
 
257
 
258
  try:
259
  results = qdrant.search(
260
  collection_name=COLLECTION_NAME,
261
+ query_vector=query_emb.tolist(),
262
  limit=top_k
263
  )
264
+ except:
 
265
  results = []
266
 
267
  if not results:
268
+ st.warning("No context found. Process your files in 'Process Dataset Files' mode.")
269
  else:
270
+ st.success(f"Found {len(results)} references!")
271
 
272
+ with st.expander("References"):
273
+ for i, r in enumerate(results, 1):
274
+ st.markdown(f"**{i}.** {r.payload['content'][:200]}...")
275
+ st.caption(f"Source: {r.payload.get('source_name')}")
 
 
 
 
276
 
277
+ with st.spinner("Generating solution..."):
 
278
 
279
+ context = "\n\n".join([r.payload['content'] for r in results])
 
 
 
280
 
281
+ prompt = f"""Solve this problem using the references.
 
 
 
 
 
 
 
 
 
 
 
 
 
282
 
283
+ PROBLEM: {problem}
284
 
285
+ REFERENCES: {context}
286
 
287
  FORMAT:
 
288
  ## SOLUTION
289
+ [Step-by-step]
290
 
291
+ ## REASONING
292
+ [Why this approach]
293
 
294
  ## REFERENCES USED
295
+ [Which sources helped]"""
 
 
 
 
 
296
 
297
  try:
298
  message = claude.messages.create(
 
301
  messages=[{"role": "user", "content": prompt}]
302
  )
303
 
 
 
304
  st.markdown("---")
305
+ st.markdown(message.content[0].text)
306
 
307
  st.download_button(
308
+ "📥 Download",
309
+ message.content[0].text,
310
+ file_name=f"solution.md"
 
311
  )
 
 
 
 
 
 
 
312
 
313
  except Exception as e:
314
  st.error(f"Error: {e}")
315
 
316
  # ============================================================================
317
+ # MODE 2: PROCESS DATASET FILES
318
  # ============================================================================
319
 
320
+ elif mode == "🏗️ Process Dataset Files":
321
 
322
+ st.title("🏗️ Process Files from HF Dataset")
 
323
 
324
+ st.info(f"""
325
+ **Dataset:** `{DATASET_REPO}`
326
+
327
+ Files are stored permanently in your HF Dataset.
328
+ Process them once, search forever!
329
+ """)
330
+
331
+ # Check if HF token exists
332
+ if not os.getenv("HF_TOKEN"):
333
+ st.error("⚠️ Missing HF_TOKEN! Add it in Settings β†’ Repository Secrets")
334
+ st.info("""
335
+ **How to get your HF Token:**
336
+ 1. Go to: https://huggingface.co/settings/tokens
337
+ 2. Click "New token"
338
+ 3. Name: "math-ai-access"
339
+ 4. Type: Read
340
+ 5. Copy the token
341
+ 6. Add as HF_TOKEN in Space Settings β†’ Secrets
342
+ """)
343
+ st.stop()
344
+
345
+ # Create collection if needed
346
+ st.header("Step 1: Setup Collection")
347
 
348
  try:
349
  collections = qdrant.get_collections().collections
350
  exists = any(c.name == COLLECTION_NAME for c in collections)
351
 
352
  if exists:
353
+ st.success(f"βœ… Collection exists")
354
  else:
355
+ if st.button("Create Collection"):
356
  qdrant.create_collection(
357
  collection_name=COLLECTION_NAME,
358
  vectors_config=VectorParams(size=384, distance=Distance.COSINE)
359
  )
360
+ st.success("Created!")
361
  st.rerun()
362
  except Exception as e:
363
  st.error(f"Error: {e}")
364
 
365
  st.markdown("---")
366
 
367
+ # Process files
368
+ st.header("Step 2: Process Files")
369
 
370
+ tab1, tab2, tab3 = st.tabs(["πŸ“š Books", "πŸ“ Exams", "πŸ–ŠοΈ Handwritten Answers"])
 
 
 
 
371
 
372
  # ========================================================================
373
+ # BOOKS
374
  # ========================================================================
375
 
376
  with tab1:
377
+ st.subheader("Process Books (Typed PDFs)")
 
 
 
 
 
 
 
378
 
379
+ if st.button("πŸ“š List Books in Dataset"):
380
+ book_files = list_dataset_files("books/")
381
+
382
+ if book_files:
383
+ st.write(f"Found {len(book_files)} books:")
384
+ for f in book_files:
385
+ st.text(f"β€’ {f}")
386
+
387
+ st.session_state.book_files = book_files
388
+ else:
389
+ st.warning("No books found in dataset/books/ folder")
390
 
391
+ if 'book_files' in st.session_state and st.button("πŸš€ Process All Books"):
392
 
393
+ for book_file in st.session_state.book_files:
394
+
395
+ with st.expander(f"Processing {book_file}"):
396
 
397
  try:
398
+ # Download
399
+ st.write("πŸ“₯ Downloading...")
400
+ local_path = download_file_from_dataset(book_file)
401
+
402
+ if not local_path:
403
+ continue
404
+
405
  # Extract
406
+ st.write("πŸ“– Extracting text...")
407
+ text = extract_text_from_pdf(local_path)
408
+
409
  if not text:
 
410
  continue
411
 
412
+ st.write(f"βœ… {len(text):,} chars")
413
 
414
  # Chunk
415
  chunks = chunk_text(text)
416
+ st.write(f"βœ‚οΈ {len(chunks)} chunks")
417
 
418
  # Embed
419
+ embeddings = embedder.encode(chunks, show_progress_bar=False)
 
420
 
421
  # Upload
422
  points = []
423
  for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
424
  points.append(PointStruct(
425
+ id=abs(hash(f"{book_file}_{i}_{time.time()}")) % (2**63),
426
  vector=emb.tolist(),
427
  payload={
428
  "content": chunk,
429
+ "source_name": book_file.split('/')[-1],
430
+ "source_type": "book",
431
  "chunk_index": i
432
  }
433
  ))
 
439
  st.error(f"Error: {e}")
440
 
441
  # ========================================================================
442
+ # EXAMS
443
  # ========================================================================
444
 
445
  with tab2:
446
+ st.subheader("Process Exams (Typed PDFs)")
447
 
448
+ if st.button("πŸ“ List Exams in Dataset"):
449
+ exam_files = list_dataset_files("exams/")
450
+
451
+ if exam_files:
452
+ st.write(f"Found {len(exam_files)} exams:")
453
+ for f in exam_files:
454
+ st.text(f"β€’ {f}")
455
+
456
+ st.session_state.exam_files = exam_files
457
+ else:
458
+ st.warning("No exams found")
459
 
460
+ if 'exam_files' in st.session_state and st.button("πŸš€ Process All Exams"):
461
+
462
+ for exam_file in st.session_state.exam_files:
463
+
464
+ with st.expander(f"Processing {exam_file}"):
465
+
466
+ try:
467
+ local_path = download_file_from_dataset(exam_file)
468
+ text = extract_text_from_pdf(local_path)
469
+
470
+ if not text:
471
+ continue
472
+
473
+ st.write(f"βœ… {len(text):,} chars")
474
+
475
+ chunks = chunk_text(text)
476
+ embeddings = embedder.encode(chunks, show_progress_bar=False)
477
+
478
+ points = []
479
+ for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
480
+ points.append(PointStruct(
481
+ id=abs(hash(f"{exam_file}_{i}_{time.time()}")) % (2**63),
482
+ vector=emb.tolist(),
483
+ payload={
484
+ "content": chunk,
485
+ "source_name": exam_file.split('/')[-1],
486
+ "source_type": "exam",
487
+ "chunk_index": i
488
+ }
489
+ ))
490
+
491
+ qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
492
+ st.success(f"βœ… Uploaded {len(points)} vectors!")
493
+
494
+ except Exception as e:
495
+ st.error(f"Error: {e}")
496
+
497
+ # ========================================================================
498
+ # HANDWRITTEN ANSWERS (AI OCR)
499
+ # ========================================================================
500
+
501
+ with tab3:
502
+ st.subheader("Process Handwritten Answers (AI OCR)")
503
 
504
+ st.warning("⚠️ This uses Claude Vision - costs ~$0.05-0.10 per PDF page")
 
 
 
 
 
 
 
505
 
506
+ if st.button("πŸ–ŠοΈ List Answer Files"):
507
+ answer_files = list_dataset_files("answers/")
508
+
509
+ if answer_files:
510
+ st.write(f"Found {len(answer_files)} answer files:")
511
+ for f in answer_files:
512
+ st.text(f"β€’ {f}")
513
+
514
+ st.session_state.answer_files = answer_files
515
+ else:
516
+ st.warning("No answers found")
517
 
518
+ if 'answer_files' in st.session_state:
519
+
520
+ # Get context from books if available
521
+ context_books = ""
522
  try:
523
  book_samples = qdrant.scroll(
524
  collection_name=COLLECTION_NAME,
525
+ limit=5,
526
  with_payload=True,
527
  with_vectors=False,
528
  scroll_filter={"must": [{"key": "source_type", "match": {"value": "book"}}]}
529
  )
530
 
531
  if book_samples and book_samples[0]:
532
+ context_books = "\n".join([p.payload['content'] for p in book_samples[0]])
533
+ st.info("βœ… Using book context for better OCR")
534
  except:
535
+ st.caption("No books processed yet - OCR will work but may be less accurate")
 
 
 
 
536
 
537
+ if st.button("πŸ€– PROCESS WITH AI OCR", type="primary"):
 
538
 
539
+ total_tokens = 0
540
+
541
+ for answer_file in st.session_state.answer_files:
 
 
 
 
 
 
 
 
 
 
 
 
 
542
 
543
+ with st.expander(f"Processing {answer_file}"):
544
+
545
+ try:
546
+ # Download
547
+ local_path = download_file_from_dataset(answer_file)
548
 
549
+ # Convert to images
550
+ st.write("πŸ–ΌοΈ Converting to images...")
551
+ images = pdf_to_images(local_path)
 
 
 
552
 
553
+ if not images:
554
+ continue
555
+
556
+ st.write(f"βœ… {len(images)} pages")
557
+
558
+ # OCR each page
559
+ transcribed_pages = []
560
+ page_tokens = 0
561
+
562
+ for page_num, image in enumerate(images, 1):
563
+ st.write(f"πŸ€– OCR Page {page_num}/{len(images)}...")
564
+
565
+ transcription, tokens = ocr_with_claude(
566
+ claude,
567
+ image,
568
+ context=context_books
569
+ )
570
+
571
+ if transcription:
572
+ transcribed_pages.append(f"\n=== Page {page_num} ===\n\n{transcription}")
573
+ page_tokens += tokens
574
+
575
+ if not transcribed_pages:
576
+ st.error("OCR failed")
577
+ continue
578
+
579
+ full_text = "\n\n".join(transcribed_pages)
580
+ st.success(f"βœ… Transcribed {len(full_text):,} chars")
581
+ st.info(f"Tokens: {page_tokens:,} (~${page_tokens * 0.000003:.3f})")
582
+ total_tokens += page_tokens
583
+
584
+ # Chunk
585
+ chunks = chunk_text(full_text)
586
+ embeddings = embedder.encode(chunks, show_progress_bar=False)
587
+
588
+ # Upload
589
+ points = []
590
+ for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
591
+ points.append(PointStruct(
592
+ id=abs(hash(f"{answer_file}_{i}_{time.time()}")) % (2**63),
593
+ vector=emb.tolist(),
594
+ payload={
595
+ "content": chunk,
596
+ "source_name": answer_file.split('/')[-1],
597
+ "source_type": "answer_handwritten",
598
+ "chunk_index": i,
599
+ "ocr_tokens": page_tokens
600
+ }
601
+ ))
602
+
603
+ qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
604
+ st.success(f"βœ… Uploaded {len(points)} vectors!")
605
+
606
+ except Exception as e:
607
+ st.error(f"Error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
608
 
609
+ st.success(f"Total tokens: {total_tokens:,} | Cost: ${total_tokens * 0.000003:.2f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
610
 
611
  # ============================================================================
612
+ # MODE 3: STATS
613
  # ============================================================================
614
 
615
+ elif mode == "πŸ“Š Stats":
 
 
616
 
617
# Stats view: summarise what is stored in the Qdrant collection by source
# type and by source file.
st.title("📊 Database Statistics")

try:
    types = {}
    sources = set()

    # Page through the whole collection. A single scroll call returns at most
    # `limit` points, so stopping after the first page would undercount types
    # and sources for larger collections.
    next_offset = None
    while True:
        batch, next_offset = qdrant.scroll(
            collection_name=COLLECTION_NAME,
            limit=1000,
            offset=next_offset,
            with_payload=True,
            with_vectors=False
        )

        for point in batch:
            payload = point.payload or {}
            src_type = payload.get('source_type', 'unknown')
            types[src_type] = types.get(src_type, 0) + 1
            sources.add(payload.get('source_name', 'Unknown'))

        # A missing next offset (or an empty page) marks the end.
        if next_offset is None or not batch:
            break

    if types:
        col1, col2 = st.columns(2)

        with col1:
            st.metric("Total Vectors", get_vector_count(qdrant))

        with col2:
            st.metric("Unique Sources", len(sources))

        st.subheader("By Type")
        scanned = sum(types.values())  # hoisted: invariant across the loop
        for doc_type, count in sorted(types.items()):
            st.progress(count / scanned, text=f"{doc_type}: {count}")

        st.subheader("Sources")
        for src in sorted(sources):
            st.caption(f"• {src}")
    else:
        # Previously an empty collection rendered a blank page.
        st.info("No vectors stored yet. Process your files in 'Process Dataset Files' mode.")

except Exception as e:
    # Top-level UI boundary: show the failure instead of crashing the app.
    st.error(f"Error: {e}")
654
 
 
655
  st.sidebar.caption("πŸŽ“ Math AI v1.0")