Hebaelsayed committed on
Commit
bd94ae2
Β·
verified Β·
1 Parent(s): 6470c63

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +317 -196
src/streamlit_app.py CHANGED
@@ -5,13 +5,14 @@ import base64
5
  from io import BytesIO
6
  from PIL import Image
7
  import PyPDF2
 
8
  from anthropic import Anthropic
9
  from qdrant_client import QdrantClient
10
  from qdrant_client.models import Distance, VectorParams, PointStruct
11
  from sentence_transformers import SentenceTransformer
12
 
13
  # ============================================================================
14
- # COMPLETE MATH AI SYSTEM - ALL-IN-ONE HUGGING FACE SPACE
15
  # ============================================================================
16
 
17
  st.set_page_config(
@@ -56,6 +57,95 @@ def extract_text_from_pdf(pdf_file):
56
  except Exception as e:
57
  return None
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  def chunk_text(text, chunk_size=150, overlap=30):
60
  """Split text into chunks"""
61
  words = text.split()
@@ -90,10 +180,9 @@ def get_vector_count(qdrant):
90
  return 0
91
 
92
  # ============================================================================
93
- # MAIN APP
94
  # ============================================================================
95
 
96
- # Initialize clients
97
  try:
98
  qdrant, claude, embedder = get_clients()
99
  st.sidebar.success("βœ… System Ready")
@@ -103,7 +192,7 @@ except Exception as e:
103
  st.stop()
104
 
105
  # ============================================================================
106
- # SIDEBAR: MODE SELECTION
107
  # ============================================================================
108
 
109
  st.sidebar.title("πŸŽ“ Math AI System")
@@ -116,18 +205,17 @@ mode = st.sidebar.radio(
116
 
117
  st.sidebar.markdown("---")
118
 
119
- # Show database stats
120
  try:
121
  vector_count = get_vector_count(qdrant)
122
  st.sidebar.metric("Vectors in DB", f"{vector_count:,}")
123
-
124
  storage_mb = (vector_count * 384 * 4) / (1024 * 1024)
125
  st.sidebar.metric("Storage Used", f"{storage_mb:.1f} MB")
126
  except:
127
  st.sidebar.warning("Database not accessible")
128
 
129
  # ============================================================================
130
- # MODE 1: SEARCH & SOLVE (Main Interface)
131
  # ============================================================================
132
 
133
  if mode == "πŸ” Search & Solve":
@@ -135,10 +223,7 @@ if mode == "πŸ” Search & Solve":
135
  st.title("πŸ” Math Problem Solver")
136
  st.markdown("*Search your knowledge base and get detailed solutions*")
137
 
138
- # ========================================================================
139
- # INPUT: Problem Statement
140
- # ========================================================================
141
-
142
  st.header("πŸ“ Input Problem")
143
 
144
  input_method = st.radio(
@@ -152,22 +237,18 @@ if mode == "πŸ” Search & Solve":
152
  if input_method == "✍️ Type Question":
153
  problem = st.text_area(
154
  "Enter math problem:",
155
- placeholder="Example: Find the gradient of the loss function L(w) = (1/2)||Xw - y||Β²",
156
  height=150
157
  )
158
-
159
  else:
160
  uploaded_exam = st.file_uploader("Upload exam PDF:", type=['pdf'])
161
  if uploaded_exam:
162
  exam_text = extract_text_from_pdf(uploaded_exam)
163
  if exam_text:
164
- st.text_area("Extracted text:", exam_text[:1000], height=200)
165
- problem = st.text_input("Extract specific question or use full text")
166
-
167
- # ========================================================================
168
- # SETTINGS
169
- # ========================================================================
170
 
 
171
  with st.expander("βš™οΈ Advanced Settings"):
172
  col1, col2 = st.columns(2)
173
 
@@ -186,29 +267,13 @@ if mode == "πŸ” Search & Solve":
186
  value="Detailed"
187
  )
188
 
189
- # ========================================================================
190
- # SOLVE BUTTON
191
- # ========================================================================
192
-
193
  if st.button("πŸš€ SOLVE PROBLEM", type="primary") and problem:
194
 
195
- with st.spinner("πŸ” Searching knowledge base..."):
196
 
197
- # Generate query embedding
198
  query_embedding = embedder.encode(problem)
199
 
200
- # Create filter
201
- filter_types = []
202
- if "Books" in search_filter:
203
- filter_types.append("book")
204
- if "Exams" in search_filter:
205
- filter_types.append("exam")
206
- if "Handwritten Solutions" in search_filter:
207
- filter_types.append("answer_handwritten")
208
- if "Public Datasets" in search_filter:
209
- filter_types.append("public_dataset")
210
-
211
- # Search Qdrant
212
  try:
213
  results = qdrant.search(
214
  collection_name=COLLECTION_NAME,
@@ -220,12 +285,11 @@ if mode == "πŸ” Search & Solve":
220
  results = []
221
 
222
  if not results:
223
- st.warning("No relevant context found. Try loading more data in Setup mode.")
224
-
225
  else:
226
- st.success(f"βœ… Found {len(results)} relevant references!")
227
 
228
- # Show retrieved context
229
  with st.expander("πŸ“š Retrieved References"):
230
  for i, result in enumerate(results, 1):
231
  similarity = result.score * 100
@@ -234,59 +298,48 @@ if mode == "πŸ” Search & Solve":
234
  st.caption(f"Source: {result.payload.get('source_name', 'Unknown')}")
235
  st.markdown("---")
236
 
237
- # Generate solution with Claude
238
- with st.spinner("πŸ€– Claude is generating solution..."):
239
 
240
- # Prepare context
241
  context = "\n\n".join([
242
- f"[Reference {i+1} from {r.payload.get('source_name', 'Unknown')}]:\n{r.payload['content']}"
243
  for i, r in enumerate(results)
244
  ])
245
 
246
- # Determine detail level
247
  detail_instructions = {
248
- "Concise": "Provide a brief solution focusing on key steps.",
249
- "Standard": "Provide a clear solution with main steps explained.",
250
- "Detailed": "Provide a comprehensive solution with detailed explanations.",
251
- "Very Detailed": "Provide an exhaustive solution with all intermediate steps, intuitions, and alternative approaches."
252
  }
253
 
254
- # Create prompt
255
- prompt = f"""You are an expert mathematics tutor specializing in machine learning mathematics.
256
 
257
- PROBLEM TO SOLVE:
258
  {problem}
259
 
260
- REFERENCE MATERIALS (from student's books, exams, and notes):
261
  {context}
262
 
263
- TASK:
264
- Solve this problem providing a complete, educational solution.
265
 
266
  {detail_instructions[detail_level]}
267
 
268
- FORMAT YOUR RESPONSE EXACTLY LIKE THIS:
269
 
270
  ## SOLUTION
271
-
272
- [Provide step-by-step solution here with clear mathematical notation]
273
 
274
  ## REASONING & APPROACH
275
-
276
- [Explain WHY you chose this approach, what concepts are involved, and how the references helped]
277
 
278
  ## REFERENCES USED
279
-
280
- [List which references you used and HOW each contributed to the solution. Be specific - mention what information came from which source]
281
 
282
  ## VERIFICATION
 
283
 
284
- [If applicable, verify the solution or discuss how to check if it's correct]
285
-
286
- IMPORTANT:
287
- - Use proper mathematical notation (LaTeX if needed: ∫, βˆ‘, βˆ‚, etc.)
288
- - Reference the student's materials when explaining concepts
289
- - Make it educational - help them understand, not just get an answer"""
290
 
291
  try:
292
  message = claude.messages.create(
@@ -297,11 +350,9 @@ IMPORTANT:
297
 
298
  solution = message.content[0].text
299
 
300
- # Display solution
301
  st.markdown("---")
302
  st.markdown(solution)
303
 
304
- # Download option
305
  st.download_button(
306
  "πŸ“₯ Download Solution",
307
  solution,
@@ -309,46 +360,27 @@ IMPORTANT:
309
  mime="text/markdown"
310
  )
311
 
312
- # API usage
313
  with st.expander("πŸ“Š API Usage"):
314
  st.json({
315
- "model": "claude-sonnet-4-20250514",
316
  "input_tokens": message.usage.input_tokens,
317
  "output_tokens": message.usage.output_tokens,
318
- "cost_estimate": f"${(message.usage.input_tokens * 0.000003 + message.usage.output_tokens * 0.000015):.4f}"
319
  })
320
 
321
  except Exception as e:
322
- st.error(f"Claude error: {e}")
323
 
324
  # ============================================================================
325
- # MODE 2: SETUP DATABASE (One-Time Processing)
326
  # ============================================================================
327
 
328
  elif mode == "πŸ—οΈ Setup Database":
329
 
330
  st.title("πŸ—οΈ Database Setup")
331
- st.markdown("*Process and upload your documents (run once)*")
332
 
333
- st.warning("""
334
- ⚠️ **IMPORTANT LIMITATION**:
335
-
336
- Hugging Face Spaces cannot directly access Google Drive files.
337
-
338
- **Recommended Solution:**
339
- 1. Use **Google Colab** for one-time processing (cloud, free)
340
- 2. Use **this HF Space** for daily searching/solving
341
-
342
- **Alternative (Manual)**:
343
- - Download PDFs from Google Drive
344
- - Upload them here one by one
345
- """)
346
-
347
- # ========================================================================
348
- # CREATE COLLECTION
349
- # ========================================================================
350
-
351
- st.header("Step 1: Create Database Collection")
352
 
353
  try:
354
  collections = qdrant.get_collections().collections
@@ -369,26 +401,32 @@ elif mode == "πŸ—οΈ Setup Database":
369
 
370
  st.markdown("---")
371
 
372
- # ========================================================================
373
- # UPLOAD OPTIONS
374
- # ========================================================================
375
-
376
  st.header("Step 2: Upload Documents")
377
 
378
- tab1, tab2, tab3 = st.tabs(["πŸ“š Upload PDFs", "πŸ“Š Load Public Datasets", "πŸ–ŠοΈ Process Handwritten (Colab)"])
 
 
 
 
 
 
 
 
379
 
380
  with tab1:
381
- st.info("Upload your books and typed exams here")
382
 
383
  uploaded_files = st.file_uploader(
384
  "Choose PDF files:",
385
  type=['pdf'],
386
- accept_multiple_files=True
 
387
  )
388
 
389
- doc_type = st.selectbox("Document type:", ["Book", "Exam", "Other"])
390
 
391
- if uploaded_files and st.button("Process & Upload PDFs"):
392
 
393
  for uploaded_file in uploaded_files:
394
  with st.expander(f"Processing {uploaded_file.name}"):
@@ -397,7 +435,7 @@ elif mode == "πŸ—οΈ Setup Database":
397
  # Extract
398
  text = extract_text_from_pdf(uploaded_file)
399
  if not text:
400
- st.error("Failed to extract text")
401
  continue
402
 
403
  st.write(f"βœ… Extracted {len(text):,} chars")
@@ -407,7 +445,8 @@ elif mode == "πŸ—οΈ Setup Database":
407
  st.write(f"βœ… Created {len(chunks)} chunks")
408
 
409
  # Embed
410
- embeddings = embedder.encode(chunks, show_progress_bar=False)
 
411
 
412
  # Upload
413
  points = []
@@ -418,7 +457,7 @@ elif mode == "πŸ—οΈ Setup Database":
418
  payload={
419
  "content": chunk,
420
  "source_name": uploaded_file.name,
421
- "source_type": doc_type.lower(),
422
  "chunk_index": i
423
  }
424
  ))
@@ -429,50 +468,200 @@ elif mode == "πŸ—οΈ Setup Database":
429
  except Exception as e:
430
  st.error(f"Error: {e}")
431
 
 
 
 
 
432
  with tab2:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
433
  st.info("Load pre-built math datasets")
434
 
435
  dataset_choice = st.selectbox(
436
  "Choose dataset:",
437
- ["GSM8K", "MATH", "MathQA"]
 
 
438
  )
439
 
440
- sample_size = st.slider("Number of samples:", 10, 1000, 100)
441
 
442
- if st.button("Load Dataset"):
443
  try:
444
  from datasets import load_dataset
445
 
446
- with st.spinner(f"Loading {dataset_choice}..."):
447
 
448
- if dataset_choice == "GSM8K":
449
  dataset = load_dataset("openai/gsm8k", "main", split="train", trust_remote_code=True)
450
  texts = [f"Problem: {dataset[i]['question']}\n\nSolution: {dataset[i]['answer']}"
451
  for i in range(min(sample_size, len(dataset)))]
 
452
 
453
- elif dataset_choice == "MATH":
454
  dataset = load_dataset("lighteval/MATH", split="train", trust_remote_code=True)
455
  texts = [f"Problem: {dataset[i].get('problem', '')}\n\nSolution: {dataset[i].get('solution', '')}"
456
  for i in range(min(sample_size, len(dataset)))]
 
457
 
458
- else: # MathQA
459
  dataset = load_dataset("allenai/math_qa", split="train", trust_remote_code=True)
460
  texts = [f"Problem: {dataset[i]['Problem']}\n\nAnswer: {dataset[i]['correct']}"
461
  for i in range(min(sample_size, len(dataset)))]
 
462
 
463
  st.write(f"βœ… Loaded {len(texts)} problems")
464
 
465
- # Embed & upload
466
  embeddings = embedder.encode(texts, show_progress_bar=True)
467
 
 
468
  points = []
469
  for i, (text, emb) in enumerate(zip(texts, embeddings)):
470
  points.append(PointStruct(
471
- id=abs(hash(f"{dataset_choice}_{i}_{time.time()}")) % (2**63),
472
  vector=emb.tolist(),
473
  payload={
474
  "content": text[:2000],
475
- "source_name": dataset_choice,
476
  "source_type": "public_dataset",
477
  "index": i
478
  }
@@ -484,64 +673,21 @@ elif mode == "πŸ—οΈ Setup Database":
484
 
485
  except Exception as e:
486
  st.error(f"Error: {e}")
487
-
488
- with tab3:
489
- st.warning("**Handwritten OCR requires Google Colab** (HF Spaces limitation)")
490
-
491
- st.markdown("""
492
- ### Why Colab for Handwritten Notes?
493
-
494
- 1. **File Access**: Need direct Google Drive access
495
- 2. **Processing Power**: OCR is compute-intensive
496
- 3. **Image Processing**: Requires additional libraries
497
-
498
- ### Steps:
499
-
500
- 1. **Click button below** to open ready-to-use Colab notebook
501
- 2. **Run the notebook** (processes handwritten PDFs with AI OCR)
502
- 3. **Vectors auto-upload** to your Qdrant database
503
- 4. **Come back here** to search!
504
-
505
- The notebook handles:
506
- - βœ… Google Drive connection
507
- - βœ… Italian cursive handwriting OCR (Claude Vision)
508
- - βœ… Context from books/exams
509
- - βœ… Direct upload to Qdrant
510
- """)
511
-
512
- colab_code_url = "https://colab.research.google.com/drive/your-notebook-id"
513
-
514
- st.link_button(
515
- "πŸ““ Open Google Colab Notebook",
516
- colab_code_url,
517
- use_container_width=True
518
- )
519
-
520
- st.info("""
521
- **What the Colab notebook will do:**
522
- - Connect to your Google Drive (one click)
523
- - Read PDFs from Math_AI_Documents/answers/
524
- - Use Claude Vision to OCR handwritten Italian cursive
525
- - Upload directly to this same Qdrant database
526
- - Takes ~30-60 minutes, costs ~$0.60
527
- """)
528
 
529
  # ============================================================================
530
- # MODE 3: TESTING DASHBOARD
531
  # ============================================================================
532
 
533
  elif mode == "πŸ§ͺ Testing Dashboard":
534
 
535
  st.title("πŸ§ͺ Testing Dashboard")
536
- st.markdown("*Evaluate system performance*")
537
 
538
- tab1, tab2, tab3 = st.tabs(["πŸ“Š Database Stats", "🎯 Accuracy Tests", "πŸ“ˆ Performance"])
539
 
540
  with tab1:
541
  st.header("Database Statistics")
542
 
543
  try:
544
- # Get sample
545
  sample = qdrant.scroll(
546
  collection_name=COLLECTION_NAME,
547
  limit=1000,
@@ -550,7 +696,6 @@ elif mode == "πŸ§ͺ Testing Dashboard":
550
  )
551
 
552
  if sample and sample[0]:
553
- # Count by type
554
  types = {}
555
  sources = set()
556
 
@@ -559,67 +704,43 @@ elif mode == "πŸ§ͺ Testing Dashboard":
559
  types[src_type] = types.get(src_type, 0) + 1
560
  sources.add(point.payload.get('source_name', 'Unknown'))
561
 
562
- # Display
563
  col1, col2, col3 = st.columns(3)
564
 
565
  with col1:
566
  st.metric("Total Vectors", get_vector_count(qdrant))
567
 
568
  with col2:
569
- st.metric("Unique Sources", len(sources))
570
 
571
  with col3:
572
- st.metric("Document Types", len(types))
573
 
574
- # Breakdown
575
- st.subheader("Breakdown by Type")
576
  for doc_type, count in sorted(types.items()):
577
  st.progress(count / sum(types.values()), text=f"{doc_type}: {count}")
578
-
579
- # Sources
580
- st.subheader("Sources")
581
- for src in sorted(sources)[:20]:
582
- st.caption(f"β€’ {src}")
583
 
584
  except Exception as e:
585
  st.error(f"Error: {e}")
586
 
587
  with tab2:
588
- st.header("Test Search Accuracy")
589
 
590
- test_query = st.text_input("Test query:", placeholder="gradient descent")
591
 
592
- if st.button("Run Test Search") and test_query:
593
-
594
  query_emb = embedder.encode(test_query)
595
-
596
  results = qdrant.search(
597
  collection_name=COLLECTION_NAME,
598
  query_vector=query_emb.tolist(),
599
  limit=5
600
  )
601
 
602
- st.write(f"**Found {len(results)} results:**")
603
-
604
  for i, r in enumerate(results, 1):
605
  similarity = r.score * 100
606
-
607
- quality = "🟒 Excellent" if similarity > 70 else "🟑 Good" if similarity > 50 else "πŸ”΄ Fair"
608
-
609
  st.markdown(f"**{i}. {quality}** ({similarity:.1f}%)")
610
  st.text(r.payload['content'][:200] + "...")
611
- st.caption(f"Source: {r.payload.get('source_name')}")
612
  st.markdown("---")
613
-
614
- with tab3:
615
- st.header("Performance Metrics")
616
-
617
- st.info("Coming soon: Response time, token usage, cost tracking")
618
-
619
- # ============================================================================
620
- # FOOTER
621
- # ============================================================================
622
 
623
  st.sidebar.markdown("---")
624
- st.sidebar.caption("πŸŽ“ Math AI System v1.0")
625
- st.sidebar.caption("Powered by Claude + Qdrant")
 
5
  from io import BytesIO
6
  from PIL import Image
7
  import PyPDF2
8
+ from pdf2image import convert_from_bytes
9
  from anthropic import Anthropic
10
  from qdrant_client import QdrantClient
11
  from qdrant_client.models import Distance, VectorParams, PointStruct
12
  from sentence_transformers import SentenceTransformer
13
 
14
  # ============================================================================
15
+ # COMPLETE MATH AI SYSTEM - 100% HUGGING FACE
16
  # ============================================================================
17
 
18
  st.set_page_config(
 
57
  except Exception as e:
58
  return None
59
 
60
def pdf_to_images(pdf_bytes):
    """Render every page of a PDF (file-like object) to a PIL image for OCR.

    Returns a list of PIL images (200 dpi), or an empty list on failure
    after surfacing the error in the Streamlit UI.
    """
    try:
        raw = pdf_bytes.read()
        return convert_from_bytes(raw, dpi=200)
    except Exception as e:
        # Best-effort: report in the UI and let the caller treat [] as "no pages".
        st.error(f"PDF to image conversion error: {e}")
        return []
68
+
69
def resize_image(image, max_size=(2048, 2048)):
    """Downscale *image* in place to fit within max_size (Claude Vision limit).

    `thumbnail` preserves aspect ratio and never upscales; the (mutated)
    image is returned for call-chaining convenience.
    """
    resample = Image.Resampling.LANCZOS
    image.thumbnail(max_size, resample)
    return image
73
+
74
def image_to_base64(image):
    """Serialize a PIL image to a base64-encoded PNG string."""
    with BytesIO() as buf:
        image.save(buf, format="PNG")
        payload = buf.getvalue()
    return base64.b64encode(payload).decode()
79
+
80
def ocr_with_claude(claude_client, image, context_books="", context_exam=""):
    """
    AI-powered OCR for handwritten Italian cursive math notes.

    NOTE: "Italian cursive" is the HANDWRITING STYLE (connected letters);
    the LANGUAGE of the notes is ENGLISH.

    Parameters:
        claude_client: an Anthropic client exposing `messages.create`.
        image: a PIL image of one handwritten page (copied before resizing,
            so the caller's image is not mutated).
        context_books: optional textbook excerpts (truncated to 2000 chars)
            that help the model disambiguate symbols.
        context_exam: optional exam question text (truncated to 1000 chars)
            giving the model context on what is being solved.

    Returns:
        (transcription, tokens) on success, where tokens is the sum of
        input and output tokens; (None, 0) on failure (error shown in UI).
    """

    # Work on a copy so the caller's image is untouched; Claude Vision
    # rejects very large images, hence the resize first.
    resized = resize_image(image.copy())
    img_b64 = image_to_base64(resized)

    prompt = f"""You are an expert in transcribing handwritten mathematical solutions.

IMPORTANT: This is written in ITALIAN CURSIVE style (connected, flowing letters), but the LANGUAGE IS ENGLISH.

CONTEXT FROM TEXTBOOKS (helps understand symbols):
{context_books[:2000] if context_books else "No context available"}

EXAM QUESTION (helps understand what's being solved):
{context_exam[:1000] if context_exam else "No exam question available"}

TASK: Transcribe this handwritten math solution into clean, readable text.

INSTRUCTIONS:
1. Language is ENGLISH (just cursive style is Italian)
2. Convert math notation properly:
- Use standard symbols: ∫, ∑, √, ∂, lim, etc.
- Use LaTeX for complex formulas
- Preserve Greek letters: α, β, γ, π, etc.
3. Maintain structure (paragraphs, steps)
4. If unclear, mark as [unclear: best guess]
5. Describe diagrams as [DIAGRAM: description]

OUTPUT: Just the transcribed text, no preamble."""

    try:
        # Multimodal request: the image block must precede the text block
        # per the Anthropic Messages API vision format.
        message = claude_client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=4000,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",
                                "data": img_b64
                            }
                        },
                        {
                            "type": "text",
                            "text": prompt
                        }
                    ]
                }
            ]
        )

        transcription = message.content[0].text
        # Total token usage, used by callers for cost estimates.
        tokens = message.usage.input_tokens + message.usage.output_tokens

        return transcription, tokens

    except Exception as e:
        # Report in the UI; the (None, 0) sentinel lets callers skip the page.
        st.error(f"OCR error: {e}")
        return None, 0
148
+
149
  def chunk_text(text, chunk_size=150, overlap=30):
150
  """Split text into chunks"""
151
  words = text.split()
 
180
  return 0
181
 
182
  # ============================================================================
183
+ # INITIALIZE
184
  # ============================================================================
185
 
 
186
  try:
187
  qdrant, claude, embedder = get_clients()
188
  st.sidebar.success("βœ… System Ready")
 
192
  st.stop()
193
 
194
  # ============================================================================
195
+ # SIDEBAR
196
  # ============================================================================
197
 
198
  st.sidebar.title("πŸŽ“ Math AI System")
 
205
 
206
  st.sidebar.markdown("---")
207
 
208
+ # Database stats
209
  try:
210
  vector_count = get_vector_count(qdrant)
211
  st.sidebar.metric("Vectors in DB", f"{vector_count:,}")
 
212
  storage_mb = (vector_count * 384 * 4) / (1024 * 1024)
213
  st.sidebar.metric("Storage Used", f"{storage_mb:.1f} MB")
214
  except:
215
  st.sidebar.warning("Database not accessible")
216
 
217
  # ============================================================================
218
+ # MODE 1: SEARCH & SOLVE
219
  # ============================================================================
220
 
221
  if mode == "πŸ” Search & Solve":
 
223
  st.title("πŸ” Math Problem Solver")
224
  st.markdown("*Search your knowledge base and get detailed solutions*")
225
 
226
+ # Input
 
 
 
227
  st.header("πŸ“ Input Problem")
228
 
229
  input_method = st.radio(
 
237
  if input_method == "✍️ Type Question":
238
  problem = st.text_area(
239
  "Enter math problem:",
240
+ placeholder="Example: Find the gradient of L(w) = (1/2)||Xw - y||Β²",
241
  height=150
242
  )
 
243
  else:
244
  uploaded_exam = st.file_uploader("Upload exam PDF:", type=['pdf'])
245
  if uploaded_exam:
246
  exam_text = extract_text_from_pdf(uploaded_exam)
247
  if exam_text:
248
+ st.text_area("Extracted:", exam_text[:1000], height=200)
249
+ problem = st.text_input("Specific question or use full text")
 
 
 
 
250
 
251
+ # Settings
252
  with st.expander("βš™οΈ Advanced Settings"):
253
  col1, col2 = st.columns(2)
254
 
 
267
  value="Detailed"
268
  )
269
 
270
+ # Solve
 
 
 
271
  if st.button("πŸš€ SOLVE PROBLEM", type="primary") and problem:
272
 
273
+ with st.spinner("πŸ” Searching..."):
274
 
 
275
  query_embedding = embedder.encode(problem)
276
 
 
 
 
 
 
 
 
 
 
 
 
 
277
  try:
278
  results = qdrant.search(
279
  collection_name=COLLECTION_NAME,
 
285
  results = []
286
 
287
  if not results:
288
+ st.warning("No relevant context found. Load data in Setup mode.")
 
289
  else:
290
+ st.success(f"βœ… Found {len(results)} references!")
291
 
292
+ # Show context
293
  with st.expander("πŸ“š Retrieved References"):
294
  for i, result in enumerate(results, 1):
295
  similarity = result.score * 100
 
298
  st.caption(f"Source: {result.payload.get('source_name', 'Unknown')}")
299
  st.markdown("---")
300
 
301
+ # Generate solution
302
+ with st.spinner("πŸ€– Generating solution..."):
303
 
 
304
  context = "\n\n".join([
305
+ f"[Reference {i+1} from {r.payload.get('source_name')}]:\n{r.payload['content']}"
306
  for i, r in enumerate(results)
307
  ])
308
 
 
309
  detail_instructions = {
310
+ "Concise": "Brief solution, key steps only.",
311
+ "Standard": "Clear solution with main steps.",
312
+ "Detailed": "Comprehensive solution with detailed explanations.",
313
+ "Very Detailed": "Exhaustive solution with all steps and intuitions."
314
  }
315
 
316
+ prompt = f"""You are an expert mathematics tutor for machine learning.
 
317
 
318
+ PROBLEM:
319
  {problem}
320
 
321
+ REFERENCES (from student's materials):
322
  {context}
323
 
324
+ TASK: Solve providing a complete educational solution.
 
325
 
326
  {detail_instructions[detail_level]}
327
 
328
+ FORMAT:
329
 
330
  ## SOLUTION
331
+ [Step-by-step solution with clear notation]
 
332
 
333
  ## REASONING & APPROACH
334
+ [WHY this approach, what concepts, how references helped]
 
335
 
336
  ## REFERENCES USED
337
+ [Which references used and HOW each contributed]
 
338
 
339
  ## VERIFICATION
340
+ [How to verify the solution]
341
 
342
+ Use proper notation (LaTeX if needed). Reference the materials when explaining."""
 
 
 
 
 
343
 
344
  try:
345
  message = claude.messages.create(
 
350
 
351
  solution = message.content[0].text
352
 
 
353
  st.markdown("---")
354
  st.markdown(solution)
355
 
 
356
  st.download_button(
357
  "πŸ“₯ Download Solution",
358
  solution,
 
360
  mime="text/markdown"
361
  )
362
 
 
363
  with st.expander("πŸ“Š API Usage"):
364
  st.json({
 
365
  "input_tokens": message.usage.input_tokens,
366
  "output_tokens": message.usage.output_tokens,
367
+ "cost": f"${(message.usage.input_tokens * 0.000003 + message.usage.output_tokens * 0.000015):.4f}"
368
  })
369
 
370
  except Exception as e:
371
+ st.error(f"Error: {e}")
372
 
373
  # ============================================================================
374
+ # MODE 2: SETUP DATABASE
375
  # ============================================================================
376
 
377
  elif mode == "πŸ—οΈ Setup Database":
378
 
379
  st.title("πŸ—οΈ Database Setup")
380
+ st.markdown("*Upload and process your documents*")
381
 
382
+ # Create collection
383
+ st.header("Step 1: Create Collection")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
 
385
  try:
386
  collections = qdrant.get_collections().collections
 
401
 
402
  st.markdown("---")
403
 
404
+ # Upload documents
 
 
 
405
  st.header("Step 2: Upload Documents")
406
 
407
+ tab1, tab2, tab3 = st.tabs([
408
+ "πŸ“š Books & Exams (Typed PDFs)",
409
+ "πŸ–ŠοΈ Handwritten Solutions (OCR)",
410
+ "πŸ“Š Public Datasets"
411
+ ])
412
+
413
+ # ========================================================================
414
+ # TAB 1: Typed PDFs
415
+ # ========================================================================
416
 
417
  with tab1:
418
+ st.info("βœ… Upload your typed PDFs (books, exams) here")
419
 
420
  uploaded_files = st.file_uploader(
421
  "Choose PDF files:",
422
  type=['pdf'],
423
+ accept_multiple_files=True,
424
+ key="typed_pdfs"
425
  )
426
 
427
+ doc_type = st.selectbox("Document type:", ["book", "exam", "reference"])
428
 
429
+ if uploaded_files and st.button("πŸ“€ Process & Upload", key="upload_typed"):
430
 
431
  for uploaded_file in uploaded_files:
432
  with st.expander(f"Processing {uploaded_file.name}"):
 
435
  # Extract
436
  text = extract_text_from_pdf(uploaded_file)
437
  if not text:
438
+ st.error("Text extraction failed")
439
  continue
440
 
441
  st.write(f"βœ… Extracted {len(text):,} chars")
 
445
  st.write(f"βœ… Created {len(chunks)} chunks")
446
 
447
  # Embed
448
+ with st.spinner("Embedding..."):
449
+ embeddings = embedder.encode(chunks, show_progress_bar=False)
450
 
451
  # Upload
452
  points = []
 
457
  payload={
458
  "content": chunk,
459
  "source_name": uploaded_file.name,
460
+ "source_type": doc_type,
461
  "chunk_index": i
462
  }
463
  ))
 
468
  except Exception as e:
469
  st.error(f"Error: {e}")
470
 
471
+ # ========================================================================
472
+ # TAB 2: Handwritten OCR (100% IN HF SPACES!)
473
+ # ========================================================================
474
+
475
  with tab2:
476
+ st.success("βœ… AI-POWERED OCR - Process handwritten notes RIGHT HERE!")
477
+
478
+ st.markdown("""
479
+ ### How it works:
480
+ 1. Upload handwritten solution PDFs (from your Google Drive)
481
+ 2. AI OCR processes each page with Claude Vision
482
+ 3. Uses your books/exams as context for better accuracy
483
+ 4. Uploads transcribed text to database
484
+
485
+ **Cost:** ~$0.05-0.10 per handwritten PDF page
486
+ """)
487
+
488
+ # Upload handwritten PDFs
489
+ handwritten_files = st.file_uploader(
490
+ "Upload handwritten solution PDFs:",
491
+ type=['pdf'],
492
+ accept_multiple_files=True,
493
+ key="handwritten_pdfs",
494
+ help="Your answer PDFs from Google Drive/Math_AI_Documents/answers/"
495
+ )
496
+
497
+ # Optional: Context from books
498
+ context_books = ""
499
+ use_context = st.checkbox("Use book context for better OCR accuracy", value=True)
500
+
501
+ if use_context:
502
+ # Get some book context from database
503
+ try:
504
+ book_samples = qdrant.scroll(
505
+ collection_name=COLLECTION_NAME,
506
+ limit=10,
507
+ with_payload=True,
508
+ with_vectors=False,
509
+ scroll_filter={"must": [{"key": "source_type", "match": {"value": "book"}}]}
510
+ )
511
+
512
+ if book_samples and book_samples[0]:
513
+ context_books = "\n".join([p.payload['content'] for p in book_samples[0][:5]])
514
+ st.caption(f"βœ… Using {len(book_samples[0])} book excerpts as context")
515
+ except:
516
+ st.caption("⚠️ No books in database yet. OCR will work but may be less accurate.")
517
+
518
+ if handwritten_files and st.button("πŸ€– PROCESS WITH AI OCR", type="primary"):
519
+
520
+ total_tokens = 0
521
+
522
+ for uploaded_file in handwritten_files:
523
+ st.markdown(f"### Processing: {uploaded_file.name}")
524
+
525
+ try:
526
+ # Convert PDF to images
527
+ with st.spinner("Converting PDF to images..."):
528
+ # Read bytes
529
+ pdf_bytes = BytesIO(uploaded_file.read())
530
+ images = pdf_to_images(pdf_bytes)
531
+
532
+ if not images:
533
+ st.error("PDF conversion failed")
534
+ continue
535
+
536
+ st.write(f"βœ… Converted to {len(images)} pages")
537
+
538
+ # OCR each page
539
+ transcribed_pages = []
540
+ page_tokens = 0
541
+
542
+ for page_num, image in enumerate(images, 1):
543
+ with st.spinner(f"OCR Page {page_num}/{len(images)}..."):
544
+
545
+ transcription, tokens = ocr_with_claude(
546
+ claude,
547
+ image,
548
+ context_books=context_books,
549
+ context_exam=""
550
+ )
551
+
552
+ if transcription:
553
+ transcribed_pages.append(f"\n=== Page {page_num} ===\n\n{transcription}")
554
+ page_tokens += tokens
555
+ st.write(f" βœ… Page {page_num} ({tokens:,} tokens)")
556
+ else:
557
+ st.write(f" ❌ Page {page_num} failed")
558
+
559
+ if not transcribed_pages:
560
+ st.error("No pages transcribed successfully")
561
+ continue
562
+
563
+ # Combine all pages
564
+ full_text = "\n\n".join(transcribed_pages)
565
+ st.success(f"βœ… Transcribed {len(full_text):,} characters")
566
+ st.info(f"πŸ“Š Tokens used: {page_tokens:,} (~${page_tokens * 0.000003:.3f})")
567
+ total_tokens += page_tokens
568
+
569
+ # Show preview
570
+ with st.expander("πŸ‘οΈ Preview transcription"):
571
+ st.text(full_text[:500] + "...")
572
+
573
+ # Chunk
574
+ chunks = chunk_text(full_text)
575
+ st.write(f"βœ… Created {len(chunks)} chunks")
576
+
577
+ # Embed
578
+ with st.spinner("Embedding..."):
579
+ embeddings = embedder.encode(chunks, show_progress_bar=False)
580
+
581
+ # Upload
582
+ points = []
583
+ for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
584
+ points.append(PointStruct(
585
+ id=abs(hash(f"handwritten_{uploaded_file.name}_{i}_{time.time()}")) % (2**63),
586
+ vector=emb.tolist(),
587
+ payload={
588
+ "content": chunk,
589
+ "source_name": uploaded_file.name,
590
+ "source_type": "answer_handwritten",
591
+ "chunk_index": i,
592
+ "handwriting_style": "italian_cursive",
593
+ "language": "english",
594
+ "ocr_method": "claude_vision",
595
+ "tokens_used": page_tokens
596
+ }
597
+ ))
598
+
599
+ qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
600
+ st.success(f"πŸŽ‰ Uploaded {len(points)} vectors from handwritten notes!")
601
+ st.balloons()
602
+
603
+ except Exception as e:
604
+ st.error(f"Error: {e}")
605
+ st.exception(e)
606
+
607
+ st.markdown("---")
608
+ st.success(f"βœ… Total tokens used: {total_tokens:,}")
609
+ st.info(f"πŸ’° Estimated total cost: ${total_tokens * 0.000003:.2f}")
610
+
611
# ========================================================================
# TAB 3: Public Datasets
# ========================================================================

with tab3:
    st.info("Load pre-built math datasets")

    dataset_choice = st.selectbox(
        "Choose dataset:",
        ["GSM8K - Grade School Math",
         "MATH - Competition Math",
         "MathQA - Word Problems"]
    )

    sample_size = st.slider("Samples:", 10, 1000, 100)

    if st.button("πŸ“₯ Load Dataset"):
        try:
            # Imported lazily: `datasets` is heavy and only needed here.
            from datasets import load_dataset

            with st.spinner("Loading..."):
                # Each branch normalizes its dataset into plain
                # "Problem/Solution" strings so the embed/upsert path
                # below is dataset-agnostic.
                if "GSM8K" in dataset_choice:
                    dataset = load_dataset("openai/gsm8k", "main", split="train", trust_remote_code=True)
                    texts = [f"Problem: {dataset[i]['question']}\n\nSolution: {dataset[i]['answer']}"
                             for i in range(min(sample_size, len(dataset)))]
                    name = "GSM8K"

                elif "MATH" in dataset_choice:
                    dataset = load_dataset("lighteval/MATH", split="train", trust_remote_code=True)
                    texts = [f"Problem: {dataset[i].get('problem', '')}\n\nSolution: {dataset[i].get('solution', '')}"
                             for i in range(min(sample_size, len(dataset)))]
                    name = "MATH"

                else:
                    dataset = load_dataset("allenai/math_qa", split="train", trust_remote_code=True)
                    texts = [f"Problem: {dataset[i]['Problem']}\n\nAnswer: {dataset[i]['correct']}"
                             for i in range(min(sample_size, len(dataset)))]
                    name = "MathQA"

                st.write(f"βœ… Loaded {len(texts)} problems")

                # Embed all problems in one batch.
                embeddings = embedder.encode(texts, show_progress_bar=True)

                # Build Qdrant points; content is capped at 2000 chars to
                # keep payloads bounded.
                points = []
                for i, (text, emb) in enumerate(zip(texts, embeddings)):
                    points.append(PointStruct(
                        id=abs(hash(f"{name}_{i}_{time.time()}")) % (2**63),
                        vector=emb.tolist(),
                        payload={
                            "content": text[:2000],
                            "source_name": name,
                            "source_type": "public_dataset",
                            "index": i
                        }
                    ))

                # Reconstructed upsert (mirrors the handwritten-upload path):
                # without it the points list was built but never persisted.
                qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
                st.success(f"πŸŽ‰ Uploaded {len(points)} vectors!")
                st.balloons()

        except Exception as e:
            st.error(f"Error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
676
 
677
  # ============================================================================
678
+ # MODE 3: TESTING
679
  # ============================================================================
680
 
681
  elif mode == "πŸ§ͺ Testing Dashboard":
682
 
683
  st.title("πŸ§ͺ Testing Dashboard")
 
684
 
685
+ tab1, tab2 = st.tabs(["πŸ“Š Stats", "🎯 Accuracy"])
686
 
687
  with tab1:
688
  st.header("Database Statistics")
689
 
690
  try:
 
691
  sample = qdrant.scroll(
692
  collection_name=COLLECTION_NAME,
693
  limit=1000,
 
696
  )
697
 
698
  if sample and sample[0]:
 
699
  types = {}
700
  sources = set()
701
 
 
704
  types[src_type] = types.get(src_type, 0) + 1
705
  sources.add(point.payload.get('source_name', 'Unknown'))
706
 
 
707
  col1, col2, col3 = st.columns(3)
708
 
709
  with col1:
710
  st.metric("Total Vectors", get_vector_count(qdrant))
711
 
712
  with col2:
713
+ st.metric("Sources", len(sources))
714
 
715
  with col3:
716
+ st.metric("Types", len(types))
717
 
718
+ st.subheader("By Type")
 
719
  for doc_type, count in sorted(types.items()):
720
  st.progress(count / sum(types.values()), text=f"{doc_type}: {count}")
 
 
 
 
 
721
 
722
  except Exception as e:
723
  st.error(f"Error: {e}")
724
 
725
  with tab2:
726
+ st.header("Test Accuracy")
727
 
728
+ test_query = st.text_input("Test query:")
729
 
730
+ if st.button("Test") and test_query:
 
731
  query_emb = embedder.encode(test_query)
 
732
  results = qdrant.search(
733
  collection_name=COLLECTION_NAME,
734
  query_vector=query_emb.tolist(),
735
  limit=5
736
  )
737
 
 
 
738
  for i, r in enumerate(results, 1):
739
  similarity = r.score * 100
740
+ quality = "🟒" if similarity > 70 else "🟑" if similarity > 50 else "πŸ”΄"
 
 
741
  st.markdown(f"**{i}. {quality}** ({similarity:.1f}%)")
742
  st.text(r.payload['content'][:200] + "...")
 
743
  st.markdown("---")
 
 
 
 
 
 
 
 
 
744
 
745
  st.sidebar.markdown("---")
746
+ st.sidebar.caption("πŸŽ“ Math AI v1.0")