Hebaelsayed commited on
Commit
3fbb8fa
·
verified ·
1 Parent(s): 0872929

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +377 -96
src/streamlit_app.py CHANGED
@@ -90,6 +90,49 @@ def check_if_processed(qdrant, file_name):
90
  except:
91
  return False
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  def list_dataset_files(folder_path):
94
  """List PDFs in HF Dataset folder"""
95
  try:
@@ -213,6 +256,17 @@ def get_vector_count(qdrant):
213
  except:
214
  return 0
215
 
 
 
 
 
 
 
 
 
 
 
 
216
  # ============================================================================
217
  # INITIALIZE
218
  # ============================================================================
@@ -235,8 +289,20 @@ st.sidebar.caption("Production v2.0")
235
 
236
  try:
237
  vector_count = get_vector_count(qdrant)
238
- st.sidebar.metric("Vectors", f"{vector_count:,}")
239
- st.sidebar.metric("Storage", f"{(vector_count * 384 * 4) / (1024 * 1024):.1f} MB")
 
 
 
 
 
 
 
 
 
 
 
 
240
  except:
241
  st.sidebar.warning("DB unavailable")
242
 
@@ -287,19 +353,58 @@ with tab1:
287
 
288
  st.markdown("---")
289
 
290
- # Processing config
291
  st.header("βš™οΈ Configuration")
292
 
293
- col1, col2 = st.columns(2)
294
 
295
- with col1:
296
- chunk_size = st.slider("Chunk size:", 50, 500, 150)
297
- chunk_overlap = st.slider("Overlap:", 0, 100, 30)
 
 
 
 
 
298
 
299
- with col2:
300
- current_model = st.session_state.get('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"])
301
- st.info(f"**Active Model:**\n{current_model}")
302
- use_context = st.checkbox("Use context for OCR", value=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
 
304
  st.markdown("---")
305
 
@@ -310,9 +415,10 @@ with tab1:
310
 
311
  with source_tabs[0]:
312
  folder_type = st.radio(
313
- "Folder:",
314
  ["πŸ“š Books", "πŸ“ Exams", "πŸ–ŠοΈ Answers (OCR)"],
315
- horizontal=True
 
316
  )
317
 
318
  if "Books" in folder_type:
@@ -322,8 +428,8 @@ with tab1:
322
  else:
323
  folder_path, doc_type = "answers/", "answer_handwritten"
324
 
325
- if st.button(f"πŸ” Scan {folder_path}"):
326
- with st.spinner("Scanning..."):
327
  files = list_dataset_files(folder_path)
328
 
329
  if files:
@@ -331,44 +437,122 @@ with tab1:
331
  for file in files:
332
  name = file.split('/')[-1]
333
  processed = check_if_processed(qdrant, name)
334
- file_status.append({"file": file, "name": name, "processed": processed})
 
 
 
 
 
 
335
 
336
  st.session_state.current_files = file_status
337
  st.session_state.current_folder = folder_path
338
  st.session_state.current_doc_type = doc_type
 
339
  else:
340
- st.warning("No files found")
341
 
 
342
  if 'current_files' in st.session_state and st.session_state.current_folder == folder_path:
343
 
344
  processed_count = sum(1 for f in st.session_state.current_files if f['processed'])
345
  pending_count = len(st.session_state.current_files) - processed_count
 
346
 
347
- col1, col2, col3 = st.columns(3)
348
- col1.metric("Total", len(st.session_state.current_files))
349
- col2.metric("βœ… Done", processed_count)
350
- col3.metric("⏳ Pending", pending_count)
 
 
351
 
352
- st.subheader("Select files:")
 
353
 
 
354
  selected_files = []
355
  for file_info in st.session_state.current_files:
356
- if file_info['processed']:
357
- st.checkbox(f"βœ… {file_info['name']}", value=False, disabled=True, key=f"f_{file_info['name']}")
358
- else:
359
- if st.checkbox(f"⏳ {file_info['name']}", value=True, key=f"f_{file_info['name']}"):
360
- selected_files.append(file_info)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
 
 
362
  if selected_files:
363
  st.markdown("---")
 
364
 
365
- if doc_type == "answer_handwritten":
366
- est_cost = len(selected_files) * 5 * 0.08
367
- st.warning(f"⚠️ OCR Cost: ~${est_cost:.2f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
368
 
369
- if st.button("πŸš€ PROCESS SELECTED", type="primary"):
 
370
 
371
- embedder = get_embedding_model(current_model)
 
372
 
373
  context_books = ""
374
  if doc_type == "answer_handwritten" and use_context:
@@ -387,56 +571,73 @@ with tab1:
387
 
388
  total_tokens = 0
389
  total_vectors = 0
 
390
 
391
- for file_info in selected_files:
392
- with st.expander(f"Processing {file_info['name']}", expanded=True):
 
 
 
 
 
 
 
 
 
393
  try:
394
  st.write("πŸ“₯ Downloading...")
395
  local_path = download_from_dataset(file_info['file'])
396
 
397
  if not local_path:
 
398
  continue
399
 
 
 
400
  if doc_type == "answer_handwritten":
401
- st.write("πŸ–ΌοΈ Converting...")
402
  images = pdf_to_images(local_path)
403
 
404
  if not images:
 
405
  continue
406
 
407
- st.write(f"βœ… {len(images)} pages")
408
 
409
  transcribed = []
410
  tokens = 0
411
 
412
  for i, img in enumerate(images, 1):
413
- st.write(f"πŸ€– OCR {i}/{len(images)}...")
414
  trans, tok = ocr_with_claude(claude, img, context_books)
415
  if trans:
416
  transcribed.append(f"\n=== Page {i} ===\n\n{trans}")
417
  tokens += tok
418
 
419
  if not transcribed:
420
- st.error("OCR failed")
421
  continue
422
 
423
  text = "\n\n".join(transcribed)
424
  total_tokens += tokens
425
- st.success(f"βœ… {len(text):,} chars (${tokens * 0.000003:.3f})")
426
 
427
  else:
428
- st.write("πŸ“– Extracting...")
429
  text = extract_text_from_pdf(local_path)
430
  if not text:
 
431
  continue
432
- st.write(f"βœ… {len(text):,} chars")
433
 
 
434
  chunks = chunk_text(text, chunk_size, chunk_overlap)
435
- st.write(f"βœ‚οΈ {len(chunks)} chunks")
436
 
437
- st.write("πŸ”’ Embedding...")
438
  embeddings = embedder.encode(chunks, show_progress_bar=False)
439
 
 
440
  points = []
441
  for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
442
  points.append(PointStruct(
@@ -452,37 +653,92 @@ with tab1:
452
 
453
  qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
454
  total_vectors += len(points)
455
- st.success(f"βœ… {len(points)} vectors!")
 
 
 
 
 
 
 
 
 
 
456
 
457
  except Exception as e:
458
- st.error(f"Error: {e}")
 
 
 
 
 
 
 
 
 
459
 
460
  st.balloons()
461
- st.success(f"Done! {total_vectors:,} vectors | ${total_tokens * 0.000003:.2f}")
462
- st.session_state.pop('current_files', None)
463
- st.rerun()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
464
 
465
  with source_tabs[1]:
466
  dataset_choice = st.selectbox(
467
- "Dataset:",
468
- ["GSM8K - Grade School Math", "MATH - Competition Math", "MathQA - Word Problems"]
 
469
  )
470
 
471
- sample_size = st.slider("Samples:", 10, 2000, 100)
472
 
473
  dataset_name = dataset_choice.split(" - ")[0]
474
  already_loaded = check_if_processed(qdrant, dataset_name)
475
 
476
  if already_loaded:
477
- st.success(f"βœ… {dataset_name} loaded!")
 
478
  else:
479
- if st.button(f"πŸ“₯ Load {dataset_name}"):
 
 
480
  try:
481
  from datasets import load_dataset
482
 
483
- embedder = get_embedding_model(current_model)
 
484
 
485
- with st.spinner("Loading..."):
486
  if "GSM8K" in dataset_choice:
487
  dataset = load_dataset("openai/gsm8k", "main", split="train", trust_remote_code=True)
488
  texts = [f"Problem: {dataset[i]['question']}\n\nSolution: {dataset[i]['answer']}"
@@ -496,10 +752,12 @@ with tab1:
496
  texts = [f"Problem: {dataset[i]['Problem']}\n\nAnswer: {dataset[i]['correct']}"
497
  for i in range(min(sample_size, len(dataset)))]
498
 
499
- st.write(f"βœ… {len(texts)} problems")
500
 
 
501
  embeddings = embedder.encode(texts, show_progress_bar=True)
502
 
 
503
  points = []
504
  for i, (text, emb) in enumerate(zip(texts, embeddings)):
505
  points.append(PointStruct(
@@ -514,11 +772,11 @@ with tab1:
514
  ))
515
 
516
  qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
517
- st.success(f"βœ… {len(points)} vectors!")
518
  st.balloons()
519
 
520
  except Exception as e:
521
- st.error(f"Error: {e}")
522
 
523
  # ============================================================================
524
  # TAB 2: SEARCH & SOLVE
@@ -528,21 +786,22 @@ with tab2:
528
  st.title("πŸ” Search & Solve")
529
 
530
  problem = st.text_area(
531
- "Problem:",
532
  placeholder="Find gradient of L(w) = (1/2)||Xw - y||Β²",
533
- height=150
 
534
  )
535
 
536
  col1, col2 = st.columns(2)
537
- col1.slider("Retrieve:", 3, 20, 5, key="top_k")
538
- col2.select_slider("Detail:", ["Concise", "Standard", "Detailed", "Exhaustive"], value="Detailed", key="detail")
539
 
540
- if st.button("πŸš€ SOLVE", type="primary") and problem:
541
 
542
- current_model = st.session_state.get('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"])
543
- embedder = get_embedding_model(current_model)
544
 
545
- with st.spinner("Searching..."):
546
  query_emb = embedder.encode(problem)
547
 
548
  try:
@@ -555,37 +814,41 @@ with tab2:
555
  results = []
556
 
557
  if not results:
558
- st.warning("No results. Load data in Dataset Manager.")
559
  else:
560
- st.success(f"Found {len(results)} references!")
561
 
562
- with st.expander("πŸ“š References"):
563
  for i, r in enumerate(results, 1):
564
- st.markdown(f"**{i}.** ({r.score*100:.0f}%)")
565
- st.text(r.payload['content'][:200] + "...")
566
- st.caption(f"Source: {r.payload.get('source_name')}")
 
567
 
568
- with st.spinner("Generating..."):
569
 
570
  context = "\n\n".join([r.payload['content'] for r in results])
571
 
572
- prompt = f"""Solve using references.
573
 
574
- PROBLEM: {problem}
 
575
 
576
- REFERENCES: {context}
 
577
 
578
- DETAIL: {st.session_state.detail}
 
 
579
 
580
- FORMAT:
581
  ## SOLUTION
582
- [Steps]
583
 
584
  ## REASONING
585
- [Why]
586
 
587
  ## REFERENCES
588
- [Sources]"""
589
 
590
  try:
591
  message = claude.messages.create(
@@ -595,25 +858,28 @@ FORMAT:
595
  )
596
 
597
  st.markdown("---")
 
598
  st.markdown(message.content[0].text)
599
 
600
  st.download_button(
601
- "πŸ“₯ Download",
602
  message.content[0].text,
603
- file_name=f"solution_{int(time.time())}.md"
 
604
  )
605
 
606
  except Exception as e:
607
- st.error(f"Error: {e}")
608
 
609
  # ============================================================================
610
  # TAB 3: STATISTICS
611
  # ============================================================================
612
 
613
  with tab3:
614
- st.title("πŸ“ˆ Statistics")
615
 
616
  try:
 
617
  sample = qdrant.scroll(
618
  collection_name=COLLECTION_NAME,
619
  limit=1000,
@@ -624,27 +890,42 @@ with tab3:
624
  if sample and sample[0]:
625
  types = {}
626
  sources = set()
 
627
 
628
  for point in sample[0]:
629
  src_type = point.payload.get('source_type', 'unknown')
 
 
630
  types[src_type] = types.get(src_type, 0) + 1
631
- sources.add(point.payload.get('source_name', 'Unknown'))
 
632
 
 
 
633
  col1, col2, col3 = st.columns(3)
634
- col1.metric("Vectors", get_vector_count(qdrant))
635
- col2.metric("Sources", len(sources))
636
- col3.metric("Types", len(types))
637
 
638
- st.subheader("Distribution")
639
- for doc_type, count in sorted(types.items()):
 
 
 
640
  pct = count / sum(types.values()) * 100
641
- st.progress(count / sum(types.values()), text=f"{doc_type}: {count} ({pct:.0f}%)")
 
 
642
 
643
- st.subheader("All Sources")
 
644
  for src in sorted(sources):
645
- st.caption(f"β€’ {src}")
 
 
 
646
 
647
  except Exception as e:
648
- st.error(f"Error: {e}")
649
 
650
- st.sidebar.caption("v2.0")
 
90
  except:
91
  return False
92
 
93
def get_file_vector_count(qdrant, file_name):
    """Return the number of vectors stored for a specific source file.

    Pages through the collection with ``scroll`` filtered on the
    ``source_name`` payload field, summing batch sizes until the
    pagination cursor is exhausted (or a safety cap is hit).

    Args:
        qdrant: Qdrant client instance.
        file_name: Value of the ``source_name`` payload field to match.

    Returns:
        int: Number of matching vectors; 0 on any error.
    """
    try:
        count = 0
        offset = None
        # Safety cap: at limit=100 per page this covers up to 10,000
        # vectors per file and guarantees termination on a bad cursor.
        for _ in range(100):
            results = qdrant.scroll(
                collection_name=COLLECTION_NAME,
                scroll_filter={
                    "must": [{"key": "source_name", "match": {"value": file_name}}]
                },
                limit=100,
                offset=offset,
                with_payload=False,
                with_vectors=False
            )
            if not results or not results[0]:
                break
            count += len(results[0])
            # results[1] is the next-page cursor; None means last page.
            offset = results[1]
            if offset is None:
                break
        return count
    except Exception:
        # Best-effort helper used only for UI metrics: was a bare
        # `except:` which also swallowed SystemExit/KeyboardInterrupt.
        return 0
118
+
119
def estimate_chunks(pdf_path, chunk_size, overlap):
    """Estimate the number of chunks a PDF will produce.

    Counts whitespace-separated words across all pages and models
    chunking as a sliding window of ``chunk_size`` words advancing by
    ``chunk_size - overlap`` words per step.

    Args:
        pdf_path: Path to the PDF file on disk.
        chunk_size: Chunk size in words.
        overlap: Overlap between consecutive chunks, in words.

    Returns:
        tuple[int, int]: ``(estimated_chunks, total_words)``;
        ``(0, 0)`` on any error.
    """
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            total_words = 0
            for page in reader.pages:
                # extract_text() can return None for image-only pages;
                # treat that as an empty page instead of crashing.
                text = page.extract_text() or ""
                total_words += len(text.split())

        # Step between chunk starts; guard against a non-positive step
        # (overlap >= chunk_size) which previously raised
        # ZeroDivisionError and was silently eaten by a bare except.
        effective_chunk_size = max(1, chunk_size - overlap)
        estimated_chunks = max(1, (total_words - chunk_size) // effective_chunk_size + 1)
        return estimated_chunks, total_words
    except Exception:
        # Estimation is advisory UI info only; fail soft.
        return 0, 0
135
+
136
  def list_dataset_files(folder_path):
137
  """List PDFs in HF Dataset folder"""
138
  try:
 
256
  except:
257
  return 0
258
 
259
+ # ============================================================================
260
+ # INITIALIZE SESSION STATE
261
+ # ============================================================================
262
+
263
+ if 'processing_complete' not in st.session_state:
264
+ st.session_state.processing_complete = False
265
+ if 'last_processed_files' not in st.session_state:
266
+ st.session_state.last_processed_files = []
267
+ if 'processing_stats' not in st.session_state:
268
+ st.session_state.processing_stats = {}
269
+
270
  # ============================================================================
271
  # INITIALIZE
272
  # ============================================================================
 
289
 
290
  try:
291
  vector_count = get_vector_count(qdrant)
292
+ st.sidebar.metric("πŸ“Š Total Vectors", f"{vector_count:,}")
293
+
294
+ # Get current embedding model
295
+ current_model_key = None
296
+ current_model_name = st.session_state.get('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"])
297
+ for key, value in EMBEDDING_MODELS.items():
298
+ if value["name"] == current_model_name:
299
+ current_model_key = key
300
+ break
301
+
302
+ if current_model_key:
303
+ dimensions = EMBEDDING_MODELS[current_model_key]["dimensions"]
304
+ storage_mb = (vector_count * dimensions * 4) / (1024 * 1024)
305
+ st.sidebar.metric("πŸ’Ύ Storage", f"{storage_mb:.1f} MB")
306
  except:
307
  st.sidebar.warning("DB unavailable")
308
 
 
353
 
354
  st.markdown("---")
355
 
356
+ # Processing configuration - ALWAYS VISIBLE
357
  st.header("βš™οΈ Configuration")
358
 
359
+ config_col1, config_col2 = st.columns(2)
360
 
361
+ with config_col1:
362
+ st.subheader("Chunking Settings")
363
+ chunk_size = st.slider("Chunk size (words):", 50, 500, 150, key="chunk_size_slider")
364
+ chunk_overlap = st.slider("Overlap (words):", 0, 100, 30, key="chunk_overlap_slider")
365
+
366
+ # Show effective chunk size
367
+ effective_size = chunk_size - chunk_overlap
368
+ st.caption(f"πŸ“ Effective chunk: {effective_size} words")
369
 
370
+ with config_col2:
371
+ st.subheader("Embedding Model")
372
+
373
+ # Get current model
374
+ current_model_name = st.session_state.get('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"])
375
+ current_model_key = None
376
+ for key, value in EMBEDDING_MODELS.items():
377
+ if value["name"] == current_model_name:
378
+ current_model_key = key
379
+ break
380
+
381
+ if not current_model_key:
382
+ current_model_key = "MiniLM-L6 (Fast, 384D)"
383
+
384
+ selected_embedding = st.selectbox(
385
+ "Select model:",
386
+ list(EMBEDDING_MODELS.keys()),
387
+ index=list(EMBEDDING_MODELS.keys()).index(current_model_key),
388
+ key="embedding_selector"
389
+ )
390
+
391
+ # Display model info
392
+ model_info = EMBEDDING_MODELS[selected_embedding]
393
+ st.info(f"""
394
+ **Active Model:** {selected_embedding}
395
+ - **Dimensions:** {model_info['dimensions']}D
396
+ - **Speed:** {model_info['speed']}
397
+ - **Quality:** {model_info['quality']}
398
+ """)
399
+
400
+ # Update session state
401
+ if st.session_state.embedding_model != model_info['name']:
402
+ if st.button("πŸ”„ Apply Model Change"):
403
+ st.session_state.embedding_model = model_info['name']
404
+ st.success("Model updated! New uploads will use this model.")
405
+ st.rerun()
406
+
407
+ use_context = st.checkbox("Use context for OCR", value=True, key="use_context_checkbox")
408
 
409
  st.markdown("---")
410
 
 
415
 
416
  with source_tabs[0]:
417
  folder_type = st.radio(
418
+ "Select folder type:",
419
  ["πŸ“š Books", "πŸ“ Exams", "πŸ–ŠοΈ Answers (OCR)"],
420
+ horizontal=True,
421
+ key="folder_type_radio"
422
  )
423
 
424
  if "Books" in folder_type:
 
428
  else:
429
  folder_path, doc_type = "answers/", "answer_handwritten"
430
 
431
+ if st.button(f"πŸ” Scan {folder_path}", key="scan_button"):
432
+ with st.spinner("Scanning HuggingFace dataset..."):
433
  files = list_dataset_files(folder_path)
434
 
435
  if files:
 
437
  for file in files:
438
  name = file.split('/')[-1]
439
  processed = check_if_processed(qdrant, name)
440
+ vector_count_file = get_file_vector_count(qdrant, name) if processed else 0
441
+ file_status.append({
442
+ "file": file,
443
+ "name": name,
444
+ "processed": processed,
445
+ "vectors": vector_count_file
446
+ })
447
 
448
  st.session_state.current_files = file_status
449
  st.session_state.current_folder = folder_path
450
  st.session_state.current_doc_type = doc_type
451
+ st.session_state.processing_complete = False
452
  else:
453
+ st.warning(f"No PDF files found in {folder_path}")
454
 
455
+ # Display files if scanned
456
  if 'current_files' in st.session_state and st.session_state.current_folder == folder_path:
457
 
458
  processed_count = sum(1 for f in st.session_state.current_files if f['processed'])
459
  pending_count = len(st.session_state.current_files) - processed_count
460
+ total_vectors = sum(f['vectors'] for f in st.session_state.current_files)
461
 
462
+ # Summary metrics
463
+ metric_col1, metric_col2, metric_col3, metric_col4 = st.columns(4)
464
+ metric_col1.metric("πŸ“ Total Files", len(st.session_state.current_files))
465
+ metric_col2.metric("βœ… Processed", processed_count)
466
+ metric_col3.metric("⏳ Pending", pending_count)
467
+ metric_col4.metric("πŸ”’ Vectors", f"{total_vectors:,}")
468
 
469
+ st.markdown("---")
470
+ st.subheader("File Status & Selection")
471
 
472
+ # File selection with status
473
  selected_files = []
474
  for file_info in st.session_state.current_files:
475
+ col1, col2, col3 = st.columns([3, 1, 1])
476
+
477
+ with col1:
478
+ if file_info['processed']:
479
+ checkbox_label = f"βœ… {file_info['name']}"
480
+ is_selected = st.checkbox(
481
+ checkbox_label,
482
+ value=False,
483
+ disabled=True,
484
+ key=f"file_{file_info['name']}"
485
+ )
486
+ else:
487
+ checkbox_label = f"⏳ {file_info['name']}"
488
+ is_selected = st.checkbox(
489
+ checkbox_label,
490
+ value=True,
491
+ key=f"file_{file_info['name']}"
492
+ )
493
+ if is_selected:
494
+ selected_files.append(file_info)
495
+
496
+ with col2:
497
+ if file_info['processed']:
498
+ st.caption(f"πŸ”’ {file_info['vectors']} vectors")
499
+ else:
500
+ st.caption("Not uploaded")
501
+
502
+ with col3:
503
+ if file_info['processed']:
504
+ status_color = "🟒"
505
+ else:
506
+ status_color = "πŸ”΄"
507
+ st.caption(status_color)
508
 
509
+ # Sizing estimation for selected files
510
  if selected_files:
511
  st.markdown("---")
512
+ st.subheader("πŸ“Š Processing Preview")
513
 
514
+ # Download one file to estimate
515
+ sample_file = selected_files[0]
516
+ with st.spinner("Calculating estimates..."):
517
+ local_path = download_from_dataset(sample_file['file'])
518
+ if local_path:
519
+ est_chunks, est_words = estimate_chunks(local_path, chunk_size, chunk_overlap)
520
+
521
+ # Calculate totals
522
+ total_est_chunks = est_chunks * len(selected_files)
523
+ total_est_words = est_words * len(selected_files)
524
+
525
+ # Get embedding dimensions
526
+ current_model_name = st.session_state.get('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"])
527
+ dimensions = 384 # default
528
+ for key, value in EMBEDDING_MODELS.items():
529
+ if value["name"] == current_model_name:
530
+ dimensions = value["dimensions"]
531
+ break
532
+
533
+ est_storage_mb = (total_est_chunks * dimensions * 4) / (1024 * 1024)
534
+
535
+ # Display estimates
536
+ est_col1, est_col2, est_col3, est_col4 = st.columns(4)
537
+ est_col1.metric("πŸ“„ Files", len(selected_files))
538
+ est_col2.metric("πŸ“ Est. Words", f"{total_est_words:,}")
539
+ est_col3.metric("βœ‚οΈ Est. Chunks", f"{total_est_chunks:,}")
540
+ est_col4.metric("πŸ’Ύ Est. Storage", f"{est_storage_mb:.2f} MB")
541
+
542
+ # OCR cost estimation
543
+ if doc_type == "answer_handwritten":
544
+ # Estimate ~5 pages per exam, $0.08 per page
545
+ est_pages = len(selected_files) * 5
546
+ est_cost = est_pages * 0.08
547
+ st.warning(f"⚠️ **OCR Processing Cost Estimate:** ~${est_cost:.2f} ({est_pages} pages Γ— $0.08/page)")
548
+
549
+ st.markdown("---")
550
 
551
+ # Process button
552
+ if st.button("πŸš€ PROCESS SELECTED FILES", type="primary", key="process_button"):
553
 
554
+ current_model_name = st.session_state.get('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"])
555
+ embedder = get_embedding_model(current_model_name)
556
 
557
  context_books = ""
558
  if doc_type == "answer_handwritten" and use_context:
 
571
 
572
  total_tokens = 0
573
  total_vectors = 0
574
+ processing_stats = {}
575
 
576
+ # Create progress tracking
577
+ progress_bar = st.progress(0)
578
+ status_text = st.empty()
579
+
580
+ for idx, file_info in enumerate(selected_files):
581
+ # Update progress
582
+ progress = (idx) / len(selected_files)
583
+ progress_bar.progress(progress)
584
+ status_text.text(f"Processing {idx + 1}/{len(selected_files)}: {file_info['name']}")
585
+
586
+ with st.expander(f"πŸ“„ {file_info['name']}", expanded=True):
587
  try:
588
  st.write("πŸ“₯ Downloading...")
589
  local_path = download_from_dataset(file_info['file'])
590
 
591
  if not local_path:
592
+ st.error("❌ Download failed")
593
  continue
594
 
595
+ file_start_time = time.time()
596
+
597
  if doc_type == "answer_handwritten":
598
+ st.write("πŸ–ΌοΈ Converting to images...")
599
  images = pdf_to_images(local_path)
600
 
601
  if not images:
602
+ st.error("❌ Conversion failed")
603
  continue
604
 
605
+ st.write(f"βœ… Converted {len(images)} pages")
606
 
607
  transcribed = []
608
  tokens = 0
609
 
610
  for i, img in enumerate(images, 1):
611
+ st.write(f"πŸ€– OCR page {i}/{len(images)}...")
612
  trans, tok = ocr_with_claude(claude, img, context_books)
613
  if trans:
614
  transcribed.append(f"\n=== Page {i} ===\n\n{trans}")
615
  tokens += tok
616
 
617
  if not transcribed:
618
+ st.error("❌ OCR failed")
619
  continue
620
 
621
  text = "\n\n".join(transcribed)
622
  total_tokens += tokens
623
+ st.success(f"βœ… Transcribed {len(text):,} characters (Cost: ${tokens * 0.000003:.3f})")
624
 
625
  else:
626
+ st.write("πŸ“– Extracting text...")
627
  text = extract_text_from_pdf(local_path)
628
  if not text:
629
+ st.error("❌ Extraction failed")
630
  continue
631
+ st.write(f"βœ… Extracted {len(text):,} characters")
632
 
633
+ st.write("βœ‚οΈ Chunking text...")
634
  chunks = chunk_text(text, chunk_size, chunk_overlap)
635
+ st.write(f"βœ… Created {len(chunks)} chunks")
636
 
637
+ st.write("πŸ”’ Generating embeddings...")
638
  embeddings = embedder.encode(chunks, show_progress_bar=False)
639
 
640
+ st.write("πŸ’Ύ Uploading to vector database...")
641
  points = []
642
  for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
643
  points.append(PointStruct(
 
653
 
654
  qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
655
  total_vectors += len(points)
656
+
657
+ file_time = time.time() - file_start_time
658
+ st.success(f"βœ… Uploaded {len(points)} vectors in {file_time:.1f}s!")
659
+
660
+ # Store stats
661
+ processing_stats[file_info['name']] = {
662
+ 'vectors': len(points),
663
+ 'chunks': len(chunks),
664
+ 'time': file_time,
665
+ 'tokens': tokens if doc_type == "answer_handwritten" else 0
666
+ }
667
 
668
  except Exception as e:
669
+ st.error(f"❌ Error: {e}")
670
+
671
+ # Complete progress
672
+ progress_bar.progress(1.0)
673
+ status_text.text(f"βœ… Completed! Processed {len(selected_files)} files")
674
+
675
+ # Store results in session state
676
+ st.session_state.processing_complete = True
677
+ st.session_state.last_processed_files = selected_files
678
+ st.session_state.processing_stats = processing_stats
679
 
680
  st.balloons()
681
+
682
+ # Final summary (persistent)
683
+ st.markdown("---")
684
+ st.success(f"πŸŽ‰ **Processing Complete!**")
685
+
686
+ summary_col1, summary_col2, summary_col3, summary_col4 = st.columns(4)
687
+ summary_col1.metric("πŸ“ Files", len(selected_files))
688
+ summary_col2.metric("πŸ”’ Vectors", f"{total_vectors:,}")
689
+ if total_tokens > 0:
690
+ summary_col3.metric("πŸ’° Cost", f"${total_tokens * 0.000003:.2f}")
691
+ summary_col4.metric("βœ… Status", "Success")
692
+
693
+ # Show persistent results if processing was completed
694
+ elif st.session_state.processing_complete and st.session_state.processing_stats:
695
+ st.markdown("---")
696
+ st.info("ℹ️ Last processing session completed. Results shown below.")
697
+
698
+ st.subheader("πŸ“Š Processing Results")
699
+
700
+ total_vectors = sum(stat['vectors'] for stat in st.session_state.processing_stats.values())
701
+ total_tokens = sum(stat['tokens'] for stat in st.session_state.processing_stats.values())
702
+
703
+ result_col1, result_col2, result_col3, result_col4 = st.columns(4)
704
+ result_col1.metric("πŸ“ Files", len(st.session_state.processing_stats))
705
+ result_col2.metric("πŸ”’ Vectors", f"{total_vectors:,}")
706
+ if total_tokens > 0:
707
+ result_col3.metric("πŸ’° Cost", f"${total_tokens * 0.000003:.2f}")
708
+ result_col4.metric("βœ… Status", "Complete")
709
+
710
+ # Detailed breakdown
711
+ with st.expander("πŸ“‹ Detailed Breakdown"):
712
+ for filename, stats in st.session_state.processing_stats.items():
713
+ st.markdown(f"**{filename}**")
714
+ st.caption(f"Vectors: {stats['vectors']:,} | Chunks: {stats['chunks']} | Time: {stats['time']:.1f}s")
715
 
716
  with source_tabs[1]:
717
  dataset_choice = st.selectbox(
718
+ "Select public dataset:",
719
+ ["GSM8K - Grade School Math", "MATH - Competition Math", "MathQA - Word Problems"],
720
+ key="dataset_selector"
721
  )
722
 
723
+ sample_size = st.slider("Number of samples:", 10, 2000, 100, key="sample_size_slider")
724
 
725
  dataset_name = dataset_choice.split(" - ")[0]
726
  already_loaded = check_if_processed(qdrant, dataset_name)
727
 
728
  if already_loaded:
729
+ vectors_count = get_file_vector_count(qdrant, dataset_name)
730
+ st.success(f"βœ… **{dataset_name}** already loaded with {vectors_count:,} vectors!")
731
  else:
732
+ st.info(f"πŸ“₯ {dataset_name} not yet loaded")
733
+
734
+ if st.button(f"πŸ“₯ Load {dataset_name}", type="primary", key="load_dataset_button"):
735
  try:
736
  from datasets import load_dataset
737
 
738
+ current_model_name = st.session_state.get('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"])
739
+ embedder = get_embedding_model(current_model_name)
740
 
741
+ with st.spinner(f"Loading {dataset_name}..."):
742
  if "GSM8K" in dataset_choice:
743
  dataset = load_dataset("openai/gsm8k", "main", split="train", trust_remote_code=True)
744
  texts = [f"Problem: {dataset[i]['question']}\n\nSolution: {dataset[i]['answer']}"
 
752
  texts = [f"Problem: {dataset[i]['Problem']}\n\nAnswer: {dataset[i]['correct']}"
753
  for i in range(min(sample_size, len(dataset)))]
754
 
755
+ st.write(f"βœ… Loaded {len(texts)} problems")
756
 
757
+ st.write("πŸ”’ Generating embeddings...")
758
  embeddings = embedder.encode(texts, show_progress_bar=True)
759
 
760
+ st.write("πŸ’Ύ Uploading to vector database...")
761
  points = []
762
  for i, (text, emb) in enumerate(zip(texts, embeddings)):
763
  points.append(PointStruct(
 
772
  ))
773
 
774
  qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
775
+ st.success(f"βœ… Uploaded {len(points)} vectors!")
776
  st.balloons()
777
 
778
  except Exception as e:
779
+ st.error(f"❌ Error: {e}")
780
 
781
  # ============================================================================
782
  # TAB 2: SEARCH & SOLVE
 
786
  st.title("πŸ” Search & Solve")
787
 
788
  problem = st.text_area(
789
+ "Enter your math problem:",
790
  placeholder="Find gradient of L(w) = (1/2)||Xw - y||Β²",
791
+ height=150,
792
+ key="problem_input"
793
  )
794
 
795
  col1, col2 = st.columns(2)
796
+ col1.slider("Retrieve top K:", 3, 20, 5, key="top_k")
797
+ col2.select_slider("Detail level:", ["Concise", "Standard", "Detailed", "Exhaustive"], value="Detailed", key="detail")
798
 
799
+ if st.button("πŸš€ SOLVE", type="primary", key="solve_button") and problem:
800
 
801
+ current_model_name = st.session_state.get('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"])
802
+ embedder = get_embedding_model(current_model_name)
803
 
804
+ with st.spinner("Searching knowledge base..."):
805
  query_emb = embedder.encode(problem)
806
 
807
  try:
 
814
  results = []
815
 
816
  if not results:
817
+ st.warning("⚠️ No results found. Please load data in Dataset Manager first.")
818
  else:
819
+ st.success(f"βœ… Found {len(results)} relevant references!")
820
 
821
+ with st.expander("πŸ“š Retrieved References", expanded=False):
822
  for i, r in enumerate(results, 1):
823
+ st.markdown(f"**Reference {i}** (Relevance: {r.score*100:.1f}%)")
824
+ st.text(r.payload['content'][:300] + "...")
825
+ st.caption(f"πŸ“ Source: {r.payload.get('source_name')} | Type: {r.payload.get('source_type')}")
826
+ st.markdown("---")
827
 
828
+ with st.spinner("Generating solution with Claude..."):
829
 
830
  context = "\n\n".join([r.payload['content'] for r in results])
831
 
832
+ prompt = f"""Solve the following math problem using the provided references.
833
 
834
+ PROBLEM:
835
+ {problem}
836
 
837
+ REFERENCES:
838
+ {context}
839
 
840
+ DETAIL LEVEL: {st.session_state.detail}
841
+
842
+ Please provide your response in the following format:
843
 
 
844
  ## SOLUTION
845
+ [Step-by-step solution]
846
 
847
  ## REASONING
848
+ [Explain why you solved it this way]
849
 
850
  ## REFERENCES
851
+ [Cite which sources you used]"""
852
 
853
  try:
854
  message = claude.messages.create(
 
858
  )
859
 
860
  st.markdown("---")
861
+ st.markdown("## πŸ“ Solution")
862
  st.markdown(message.content[0].text)
863
 
864
  st.download_button(
865
+ "πŸ“₯ Download Solution",
866
  message.content[0].text,
867
+ file_name=f"solution_{int(time.time())}.md",
868
+ mime="text/markdown"
869
  )
870
 
871
  except Exception as e:
872
+ st.error(f"❌ Error generating solution: {e}")
873
 
874
  # ============================================================================
875
  # TAB 3: STATISTICS
876
  # ============================================================================
877
 
878
  with tab3:
879
+ st.title("πŸ“ˆ Database Statistics")
880
 
881
  try:
882
+ # Get sample of all data
883
  sample = qdrant.scroll(
884
  collection_name=COLLECTION_NAME,
885
  limit=1000,
 
890
  if sample and sample[0]:
891
  types = {}
892
  sources = set()
893
+ source_vectors = {}
894
 
895
  for point in sample[0]:
896
  src_type = point.payload.get('source_type', 'unknown')
897
+ src_name = point.payload.get('source_name', 'Unknown')
898
+
899
  types[src_type] = types.get(src_type, 0) + 1
900
+ sources.add(src_name)
901
+ source_vectors[src_name] = source_vectors.get(src_name, 0) + 1
902
 
903
+ # Overall metrics
904
+ total_vectors = get_vector_count(qdrant)
905
  col1, col2, col3 = st.columns(3)
906
+ col1.metric("πŸ“Š Total Vectors", f"{total_vectors:,}")
907
+ col2.metric("πŸ“ Unique Sources", len(sources))
908
+ col3.metric("πŸ“‚ Document Types", len(types))
909
 
910
+ st.markdown("---")
911
+
912
+ # Distribution by type
913
+ st.subheader("πŸ“Š Distribution by Document Type")
914
+ for doc_type, count in sorted(types.items(), key=lambda x: x[1], reverse=True):
915
  pct = count / sum(types.values()) * 100
916
+ st.progress(count / sum(types.values()), text=f"{doc_type}: {count:,} vectors ({pct:.1f}%)")
917
+
918
+ st.markdown("---")
919
 
920
+ # All sources
921
+ st.subheader("πŸ“š All Data Sources")
922
  for src in sorted(sources):
923
+ vector_count = source_vectors.get(src, 0)
924
+ st.caption(f"β€’ **{src}** - {vector_count:,} vectors")
925
+ else:
926
+ st.info("πŸ“­ No data in database yet. Upload some files in the Dataset Manager!")
927
 
928
  except Exception as e:
929
+ st.error(f"❌ Error loading statistics: {e}")
930
 
931
+ st.sidebar.caption("Powered by Claude AI")