Spaces:
Running
Running
File size: 60,127 Bytes
00ba8a4 72cf124 3f8fbec 3ebdd9a 3508eff 3ebdd9a 3508eff 3ebdd9a 3f8fbec 0c74bf5 3f8fbec d9f0bf7 0c74bf5 c2aa208 4b38ac0 3331648 4b38ac0 72cf124 3331648 3ebdd9a d9f0bf7 72cf124 4e707f6 3f8fbec e29990f 3f8fbec c2aa208 0c74bf5 c2aa208 0c74bf5 c2aa208 0c74bf5 3508eff 3331648 3508eff 3331648 3508eff d9f0bf7 a02e988 0c74bf5 a02e988 6d08aa2 3ebdd9a 6d08aa2 3ebdd9a 3508eff 3ebdd9a 3508eff 3f8fbec 0c74bf5 c2aa208 3ebdd9a 3508eff 3ebdd9a 0c74bf5 3508eff 0c74bf5 3508eff 0c74bf5 3508eff 0c74bf5 3508eff 3fbb8fa 0c74bf5 3fbb8fa 0c74bf5 3fbb8fa d9f0bf7 3331648 3f8fbec d9f0bf7 3331648 3f8fbec 3331648 d9f0bf7 3331648 bd94ae2 d9f0bf7 3508eff d9f0bf7 3508eff d9f0bf7 3331648 d9f0bf7 3331648 d9f0bf7 bd94ae2 3331648 bd94ae2 3331648 bd94ae2 3331648 bd94ae2 d9f0bf7 c2aa208 bd94ae2 c2aa208 bd94ae2 3331648 bd94ae2 d9f0bf7 bd94ae2 3508eff bd94ae2 3ebdd9a 3331648 3ebdd9a 3331648 3f8fbec 0c74bf5 3f8fbec 3331648 3f8fbec 6d08aa2 3ebdd9a 3508eff 3ebdd9a 3331648 3ebdd9a 989d169 bd94ae2 989d169 3331648 c2aa208 989d169 3ebdd9a 3fbb8fa a02e988 3fbb8fa 0c74bf5 3ebdd9a 3331648 3508eff 989d169 0c74bf5 c2aa208 0c74bf5 c2aa208 0c74bf5 c2aa208 0c74bf5 c2aa208 3f8fbec 3331648 3f8fbec 3331648 3508eff 3331648 3508eff c89d7a4 3508eff 3331648 3508eff c89d7a4 3508eff 3331648 3508eff 3331648 3ebdd9a 3331648 3508eff 3331648 3ebdd9a 3508eff d9f0bf7 3fbb8fa 3331648 d9f0bf7 3fbb8fa d9f0bf7 3fbb8fa 3ebdd9a 3fbb8fa a02e988 3fbb8fa a02e988 3fbb8fa c2aa208 3ebdd9a 0c74bf5 c2aa208 0c74bf5 3508eff 3331648 bd94ae2 3331648 3ebdd9a 3508eff 3fbb8fa 3331648 3fbb8fa 3508eff 3331648 3508eff 3331648 3508eff 3331648 3508eff 0c74bf5 3508eff 0c74bf5 c2aa208 0c74bf5 c2aa208 0c74bf5 c2aa208 0c74bf5 3ebdd9a 3fbb8fa 3508eff 3fbb8fa 0c74bf5 c2aa208 3508eff 3fbb8fa c2aa208 3fbb8fa c2aa208 3508eff 3fbb8fa 3508eff 3fbb8fa 3508eff c2aa208 3fbb8fa 0c74bf5 3fbb8fa 0c74bf5 3fbb8fa c2aa208 0c74bf5 c2aa208 0c74bf5 c2aa208 0c74bf5 3fbb8fa c2aa208 3508eff 3fbb8fa 3508eff 3fbb8fa d9f0bf7 0c74bf5 3fbb8fa 0c74bf5 3fbb8fa a02e988 3fbb8fa c2aa208 0c74bf5 
3fbb8fa c2aa208 0c74bf5 c2aa208 0c74bf5 c2aa208 3fbb8fa c2aa208 3fbb8fa 3508eff 3fbb8fa 3f8fbec a02e988 3fbb8fa 3508eff c2aa208 3508eff 3331648 3508eff c2aa208 3508eff c2aa208 3508eff c2aa208 3fbb8fa 3508eff 3fbb8fa 3508eff c2aa208 0c74bf5 c2aa208 0c74bf5 c2aa208 0c74bf5 c2aa208 0c74bf5 c2aa208 3508eff 3fbb8fa 3508eff 3fbb8fa c2aa208 3fbb8fa 3508eff c2aa208 3508eff c2aa208 3508eff c2aa208 3508eff c2aa208 3508eff 3fbb8fa 3508eff 3fbb8fa 3508eff 3fbb8fa 3508eff 3fbb8fa 3508eff 3fbb8fa 3508eff 3fbb8fa 3508eff 0c74bf5 c2aa208 3508eff 3fbb8fa c2aa208 3fbb8fa 3508eff 3fbb8fa 3508eff 3fbb8fa c2aa208 3fbb8fa c2aa208 0c74bf5 3fbb8fa c2aa208 3fbb8fa c2aa208 3fbb8fa c2aa208 3fbb8fa c2aa208 3fbb8fa c2aa208 0c74bf5 c2aa208 3ebdd9a 3508eff 3fbb8fa 3508eff 3fbb8fa 3508eff 0c74bf5 3508eff 3fbb8fa 3508eff 3fbb8fa 3508eff d9f0bf7 a02e988 3fbb8fa 3508eff 3fbb8fa 3508eff 3331648 3508eff d9f0bf7 3fbb8fa 3508eff 3fbb8fa 3508eff d9f0bf7 3fbb8fa d9f0bf7 3508eff d9f0bf7 3508eff d9f0bf7 3508eff 0c74bf5 d9f0bf7 3fbb8fa 3508eff d9f0bf7 3508eff 3fbb8fa c2aa208 6d08aa2 3508eff 6d08aa2 3508eff 3ebdd9a 3508eff 3fbb8fa 3331648 3fbb8fa 3508eff 3fbb8fa 3508eff 3fbb8fa 3f8fbec a02e988 3fbb8fa 3508eff 3fbb8fa 3508eff d9f0bf7 3508eff 3331648 3508eff 3fbb8fa 3508eff 3fbb8fa 3ebdd9a 3fbb8fa 3508eff 3fbb8fa d9f0bf7 3fbb8fa 3508eff 6e1b1a8 3fbb8fa 6e1b1a8 3fbb8fa 6e1b1a8 3fbb8fa 6e1b1a8 3fbb8fa 6e1b1a8 3fbb8fa 6e1b1a8 3fbb8fa 6e1b1a8 3fbb8fa 6e1b1a8 3fbb8fa 6e1b1a8 3fbb8fa 6e1b1a8 3fbb8fa 6e1b1a8 3fbb8fa 6e1b1a8 3fbb8fa 6e1b1a8 3fbb8fa 6e1b1a8 3fbb8fa 0c74bf5 6e1b1a8 3fbb8fa 0c74bf5 3fbb8fa 6e1b1a8 3fbb8fa 0c74bf5 6e1b1a8 3fbb8fa 6e1b1a8 3fbb8fa 6e1b1a8 3fbb8fa 6e1b1a8 3fbb8fa 6e1b1a8 0c74bf5 3fbb8fa 6e1b1a8 3fbb8fa 6e1b1a8 3fbb8fa 6e1b1a8 c2aa208 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 
1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 |
import streamlit as st
import os
import time
import base64
import hashlib
from io import BytesIO
from PIL import Image
import PyPDF2
from pdf2image import convert_from_path
from anthropic import Anthropic
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
from sentence_transformers import SentenceTransformer
from huggingface_hub import hf_hub_download, list_repo_files
from pathlib import Path
import shutil
import json
import pickle
# ============================================================================
# PRODUCTION MATH AI SYSTEM
# ============================================================================
# Page-level Streamlit settings: browser-tab title, tab icon and wide layout.
_PAGE_SETTINGS = {
    "page_title": "Math AI System",
    "page_icon": "π",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)
# Name of the Qdrant collection every read/write in this app targets.
COLLECTION_NAME = "math_knowledge_base"
# Hugging Face dataset repository that holds the source PDFs.
DATASET_REPO = "Hebaelsayed/math-ai-documents"  # CHANGE THIS!
# Local cache directories: downloaded PDFs and saved OCR results.
CACHE_DIR = Path("/tmp/hf_cache")
OCR_CACHE_DIR = Path("/tmp/ocr_cache")
# Create both directories up front so later reads/writes can assume they exist.
for _cache_dir in (CACHE_DIR, OCR_CACHE_DIR):
    _cache_dir.mkdir(exist_ok=True)
# ============================================================================
# EMBEDDING MODELS
# ============================================================================
# Catalogue of selectable embedding models, keyed by the label shown in the UI.
# Each entry carries the model id, its vector dimensionality and two
# human-readable speed/quality tags.
EMBEDDING_MODELS = {
    label: {
        "name": model_id,
        "dimensions": dims,
        "speed": speed,
        "quality": quality,
    }
    for label, model_id, dims, speed, quality in (
        ("MiniLM-L6 (Fast, 384D)", "sentence-transformers/all-MiniLM-L6-v2", 384, "Fast", "Good"),
        ("MiniLM-L12 (Balanced, 384D)", "sentence-transformers/all-MiniLM-L12-v2", 384, "Medium", "Better"),
        ("MPNet (Best, 768D)", "sentence-transformers/all-mpnet-base-v2", 768, "Slower", "Excellent"),
    )
}
# ============================================================================
# INITIALIZE SESSION STATE
# ============================================================================
# Seed st.session_state with defaults; keys that already exist are left alone
# so values survive Streamlit reruns.
_SESSION_DEFAULTS = (
    ('processing_complete', False),
    ('last_processed_files', []),
    ('processing_stats', {}),
    ('embedding_model', EMBEDDING_MODELS["MiniLM-L6 (Fast, 384D)"]["name"]),
)
for _state_key, _state_default in _SESSION_DEFAULTS:
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
# ============================================================================
# CACHED RESOURCES
# ============================================================================
@st.cache_resource
def get_qdrant_client():
    """Create the Qdrant client from the QDRANT_URL / QDRANT_API_KEY env vars.

    Wrapped in ``st.cache_resource`` so the client object is reused.
    """
    endpoint = os.getenv("QDRANT_URL")
    secret = os.getenv("QDRANT_API_KEY")
    return QdrantClient(url=endpoint, api_key=secret)
@st.cache_resource
def get_claude_client():
    """Create the Anthropic client using the ANTHROPIC_API_KEY env var.

    Wrapped in ``st.cache_resource`` so the client object is reused.
    """
    secret = os.getenv("ANTHROPIC_API_KEY")
    return Anthropic(api_key=secret)
@st.cache_resource
def get_embedding_model(model_name):
    """Load the SentenceTransformer identified by ``model_name``.

    Wrapped in ``st.cache_resource``, so each model name is loaded once.
    """
    encoder = SentenceTransformer(model_name)
    return encoder
# ============================================================================
# CACHE MANAGEMENT FUNCTIONS
# ============================================================================
def get_cache_path(file_path):
    """Return the local cache location for a dataset file.

    The file name is the MD5 hex digest of ``file_path`` followed by ``.pdf``,
    placed inside ``CACHE_DIR``.
    """
    digest = hashlib.md5(file_path.encode())
    cache_name = digest.hexdigest() + ".pdf"
    return CACHE_DIR / cache_name
def is_file_cached(file_path):
    """Return True when the cache file for ``file_path`` exists on disk."""
    return get_cache_path(file_path).exists()
def get_cached_file(file_path):
    """Return the cached copy's path as a string, or None if it is not cached."""
    candidate = get_cache_path(file_path)
    return str(candidate) if candidate.exists() else None
def download_with_cache(file_path):
    """Return a local path for a dataset file, downloading it only on a cache miss.

    Args:
        file_path: Path of the file inside the ``DATASET_REPO`` dataset.

    Returns:
        ``(local_path, cache_hit)``. ``local_path`` is a string path inside
        ``CACHE_DIR``, or ``None`` when the download failed (an error is shown
        via ``st.error``). ``cache_hit`` is True only when the file was already
        cached.
    """
    # Check cache first
    cached = get_cached_file(file_path)
    if cached:
        return cached, True

    # Download if not cached
    try:
        downloaded_path = hf_hub_download(
            repo_id=DATASET_REPO,
            filename=file_path,
            repo_type="dataset",
            token=os.getenv("HF_TOKEN"),
        )
        cache_path = get_cache_path(file_path)
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        # Copy under a temporary name and rename afterwards. An interrupted
        # copy straight onto cache_path would leave a truncated PDF that every
        # later lookup treats as a valid cache hit. The ".part" suffix also
        # keeps the temporary file out of the "*.pdf" cache listings.
        tmp_path = cache_path.with_name(cache_path.name + ".part")
        try:
            shutil.copy(downloaded_path, tmp_path)
            os.replace(tmp_path, cache_path)
        finally:
            # No-op after a successful rename; removes leftovers on failure.
            tmp_path.unlink(missing_ok=True)
        return str(cache_path), False
    except Exception as e:
        st.error(f"Download error: {e}")
        return None, False
def clear_cache():
    """Delete every cached download and leave an empty cache directory behind.

    Always returns True.
    """
    cache_present = CACHE_DIR.exists()
    if cache_present:
        shutil.rmtree(CACHE_DIR)
    # Recreate the directory so later downloads have somewhere to land.
    CACHE_DIR.mkdir(exist_ok=True)
    return True
def get_cache_size():
    """Return the combined size of the cached ``*.pdf`` files, in megabytes."""
    if not CACHE_DIR.exists():
        return 0.0
    byte_total = sum(entry.stat().st_size for entry in CACHE_DIR.glob("*.pdf"))
    return byte_total / (1024 * 1024)  # bytes -> MB
# ============================================================================
# OCR CACHE FUNCTIONS - CRITICAL FOR COST SAVINGS
# ============================================================================
def get_ocr_cache_path(file_name):
    """Return the OCR cache location for ``file_name``.

    The cache file is named after the MD5 hex digest of ``file_name`` with a
    ``.json`` suffix and lives inside ``OCR_CACHE_DIR``.
    """
    digest = hashlib.md5(file_name.encode())
    cache_name = digest.hexdigest() + ".json"
    return OCR_CACHE_DIR / cache_name
def is_ocr_cached(file_name):
    """Return True when an OCR cache file exists for ``file_name``."""
    return get_ocr_cache_path(file_name).exists()
def save_ocr_to_cache(file_name, transcribed_text, total_tokens):
    """Persist one OCR result as a JSON file in the OCR cache.

    Args:
        file_name: Name used to derive the cache file (see ``get_ocr_cache_path``).
        transcribed_text: The OCR transcription to store.
        total_tokens: Token count reported for the OCR run; also used for the
            stored cost figure.

    Returns:
        True when the cache file was written, False on any failure (a warning
        is shown via ``st.warning``).
    """
    try:
        cache_path = get_ocr_cache_path(file_name)
        cache_data = {
            "file_name": file_name,
            "transcribed_text": transcribed_text,
            "total_tokens": total_tokens,
            "timestamp": time.time(),
            # Flat per-token rate; presumably USD per token -- verify against
            # current pricing before relying on the displayed figure.
            "cost": total_tokens * 0.000003
        }
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        # Write to a temporary name and rename afterwards. A dump that fails
        # half-way must not leave a corrupt file at cache_path, because
        # is_ocr_cached() only checks that the file exists. The ".part" suffix
        # keeps the temporary file out of the "*.json" cache listings.
        tmp_path = cache_path.with_name(cache_path.name + ".part")
        try:
            with open(tmp_path, 'w', encoding='utf-8') as f:
                json.dump(cache_data, f, ensure_ascii=False, indent=2)
            os.replace(tmp_path, cache_path)
        finally:
            # No-op after a successful rename; removes leftovers on failure.
            tmp_path.unlink(missing_ok=True)
        return True
    except Exception as e:
        st.warning(f"Could not save OCR cache: {e}")
        return False
def load_ocr_from_cache(file_name):
    """Read a stored OCR result.

    Returns:
        ``(transcribed_text, total_tokens)`` from the cache file, or
        ``(None, 0)`` when there is no cache file or it cannot be read (in the
        latter case a warning is shown via ``st.warning``).
    """
    try:
        cache_path = get_ocr_cache_path(file_name)
        if cache_path.exists():
            with open(cache_path, 'r', encoding='utf-8') as handle:
                payload = json.load(handle)
            text = payload.get('transcribed_text')
            tokens = payload.get('total_tokens', 0)
            return text, tokens
        return None, 0
    except Exception as e:
        st.warning(f"Could not load OCR cache: {e}")
        return None, 0
def clear_ocr_cache():
    """Delete every stored OCR result and leave an empty cache directory behind.

    Always returns True.
    """
    cache_present = OCR_CACHE_DIR.exists()
    if cache_present:
        shutil.rmtree(OCR_CACHE_DIR)
    # Recreate the directory so later OCR results have somewhere to land.
    OCR_CACHE_DIR.mkdir(exist_ok=True)
    return True
def get_ocr_cache_size():
    """Return the combined size of the ``*.json`` OCR cache files, in megabytes."""
    if not OCR_CACHE_DIR.exists():
        return 0.0
    byte_total = sum(entry.stat().st_size for entry in OCR_CACHE_DIR.glob("*.json"))
    return byte_total / (1024 * 1024)  # bytes -> MB
def get_ocr_cache_stats():
    """Summarise the OCR cache.

    Returns:
        ``(total_files, total_cost_saved)``: the number of readable cache
        entries and the sum of their stored ``cost`` values. Entries that
        cannot be read or parsed are skipped.
    """
    total_files = 0
    total_cost_saved = 0.0
    if not OCR_CACHE_DIR.exists():
        return total_files, total_cost_saved

    for cache_file in OCR_CACHE_DIR.glob("*.json"):
        try:
            # The files are written as UTF-8 with ensure_ascii=False, so they
            # must be read as UTF-8 regardless of the platform default.
            with open(cache_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            cost = float(data.get('cost', 0) or 0)
        except (OSError, ValueError, TypeError, AttributeError):
            # Unreadable, malformed or unexpectedly shaped entry: leave it out
            # of the statistics instead of failing the whole sidebar.
            continue
        total_files += 1
        total_cost_saved += cost
    return total_files, total_cost_saved
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
def check_if_processed(qdrant, file_name, chunk_size=None, embedding_model=None, strategy="filename_only"):
    """Check whether vectors for a file already exist in the collection.

    Strategies:
    - "filename_only": match on filename only
    - "filename_settings": match on filename + chunk_size
    - "filename_full": match on filename + chunk_size + embedding_model

    A ``chunk_size`` / ``embedding_model`` of None is never added to the
    filter, even when the strategy would otherwise use it.

    Args:
        qdrant: Client exposing ``get_collection`` and ``count``.
        file_name: Value matched against the ``source_name`` payload field.
        chunk_size: Value matched against the ``chunk_size`` payload field.
        embedding_model: Value matched against the ``embedding_model`` field.
        strategy: One of the strategy names listed above.

    Returns:
        ``(processed, count)``: whether at least one matching vector exists,
        and how many. Any failure yields ``(False, 0)``, i.e. the file is
        reported as not processed.
    """
    try:
        # An empty collection cannot contain the file; skip the count query.
        collection_info = qdrant.get_collection(collection_name=COLLECTION_NAME)
        if collection_info.points_count == 0:
            return False, 0

        # Build filter based on strategy
        filter_conditions = [
            FieldCondition(key="source_name", match=MatchValue(value=file_name))
        ]
        if strategy in ("filename_settings", "filename_full") and chunk_size is not None:
            filter_conditions.append(
                FieldCondition(key="chunk_size", match=MatchValue(value=chunk_size))
            )
        if strategy == "filename_full" and embedding_model is not None:
            filter_conditions.append(
                FieldCondition(key="embedding_model", match=MatchValue(value=embedding_model))
            )

        # Count matching vectors
        count_result = qdrant.count(
            collection_name=COLLECTION_NAME,
            count_filter=Filter(must=filter_conditions)
        )
        return count_result.count > 0, count_result.count
    except Exception:
        # Best-effort lookup: a missing collection or a client error means
        # "not processed". Unlike a bare except, this lets KeyboardInterrupt
        # and SystemExit propagate.
        return False, 0
def get_file_vector_count(qdrant, file_name):
    """Return how many vectors in the collection have ``source_name == file_name``.

    Returns 0 when the count query fails for any reason.
    """
    try:
        name_filter = Filter(
            must=[
                FieldCondition(key="source_name", match=MatchValue(value=file_name))
            ]
        )
        count_result = qdrant.count(
            collection_name=COLLECTION_NAME,
            count_filter=name_filter
        )
        return count_result.count
    except Exception:
        # Best-effort: treat any client error as "no vectors". Unlike a bare
        # except, this lets KeyboardInterrupt and SystemExit propagate.
        return 0
def estimate_chunks(pdf_path, chunk_size, overlap):
    """Estimate how many chunks a PDF's extractable text will produce.

    The estimate uses the same sliding-window arithmetic as ``chunk_text``:
    one chunk starts every ``chunk_size - overlap`` words.

    Args:
        pdf_path: Path of the PDF to inspect.
        chunk_size: Chunk length in words.
        overlap: Number of words shared by consecutive chunks.

    Returns:
        ``(estimated_chunks, total_words)``. The estimate is never below 1 for
        a readable PDF, even when no text could be extracted. ``(0, 0)`` is
        returned when the settings are unusable (``chunk_size <= 0`` or
        ``overlap >= chunk_size``) or the PDF cannot be read.
    """
    step = chunk_size - overlap
    if chunk_size <= 0 or step <= 0:
        # A window that never advances cannot be estimated.
        return 0, 0
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Guard against pages whose text extraction yields None.
            total_words = sum(
                len((page.extract_text() or "").split())
                for page in reader.pages
            )
    except Exception:
        # Missing, unreadable or malformed PDF: report "nothing to estimate".
        return 0, 0
    # Ceiling division: one chunk per window start position.
    estimated_chunks = max(1, -(-total_words // step))
    return estimated_chunks, total_words
def list_dataset_files(folder_path):
    """List the ``.pdf`` files under ``folder_path`` in the HF dataset repo.

    Returns the matching repo paths, or an empty list when listing fails (the
    error is shown via ``st.error``).
    """
    try:
        repo_files = list_repo_files(
            repo_id=DATASET_REPO,
            repo_type="dataset",
            token=os.getenv("HF_TOKEN"),
        )
        matches = []
        for repo_file in repo_files:
            if repo_file.startswith(folder_path) and repo_file.endswith('.pdf'):
                matches.append(repo_file)
        return matches
    except Exception as e:
        st.error(f"Error listing: {e}")
        return []
def extract_text_from_pdf(pdf_path):
    """Return the text of every page, each prefixed with a ``=== Page N ===`` marker.

    Returns None when the PDF cannot be read (the error is shown via
    ``st.error``).
    """
    try:
        with open(pdf_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            sections = [
                f"\n\n=== Page {page_num + 1} ===\n\n{page.extract_text()}"
                for page_num, page in enumerate(reader.pages)
            ]
        return "".join(sections)
    except Exception as e:
        st.error(f"Extraction error: {e}")
        return None
def pdf_to_images(pdf_path):
    """Convert a PDF to images at 200 dpi via ``convert_from_path``.

    Returns an empty list when conversion fails; the error and a hint about
    installing poppler are shown in the UI.
    """
    try:
        return convert_from_path(pdf_path, dpi=200)
    except Exception as e:
        st.error(f"Conversion error: {e}")
        st.info("π‘ Add 'poppler-utils' to packages.txt")
        return []
def resize_image(image, max_size=(2048, 2048)):
    """Call ``image.thumbnail`` with ``max_size`` (LANCZOS) and return the same object."""
    resample_filter = Image.Resampling.LANCZOS
    image.thumbnail(max_size, resample_filter)
    return image
def image_to_base64(image):
    """Serialise ``image`` as PNG and return the bytes base64-encoded as a str."""
    with BytesIO() as png_buffer:
        image.save(png_buffer, format="PNG")
        raw_png = png_buffer.getvalue()
    return base64.b64encode(raw_png).decode()
def ocr_with_claude(claude_client, image, context=""):
    """Transcribe one page image with Claude.

    Args:
        claude_client: Client exposing ``messages.create``.
        image: Page image; a copy is resized and sent, the original is untouched.
        context: Optional reference text; at most its first 2000 characters
            are embedded in the prompt.

    Returns:
        ``(transcription, tokens_used)`` where ``tokens_used`` is input plus
        output tokens, or ``(None, 0)`` when the request fails (the error is
        shown via ``st.error``).
    """
    # Work on a copy so the caller's image is not modified by the resize.
    encoded_page = image_to_base64(resize_image(image.copy()))
    prompt = f"""You are transcribing handwritten mathematical solutions written in Italian cursive.
CONTEXT (from textbooks and exams):
{context[:2000] if context else "No context available"}
INSTRUCTIONS:
- Transcribe ALL mathematical notation accurately (symbols, equations, matrices, etc.)
- Preserve the structure and formatting
- If text is in Italian, transcribe it in Italian
- For unclear symbols, use context to infer the most likely interpretation
- Output ONLY the transcription, no explanations
Transcribe this page:"""
    image_part = {
        "type": "image",
        "source": {"type": "base64", "media_type": "image/png", "data": encoded_page},
    }
    text_part = {"type": "text", "text": prompt}
    try:
        message = claude_client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=4000,
            messages=[{"role": "user", "content": [image_part, text_part]}],
        )
        transcription = message.content[0].text
        tokens_used = message.usage.input_tokens + message.usage.output_tokens
        return transcription, tokens_used
    except Exception as e:
        st.error(f"OCR error: {e}")
        return None, 0
def chunk_text(text, chunk_size=150, overlap=30):
    """Split text into overlapping chunks of whitespace-separated words.

    A new chunk starts every ``chunk_size - overlap`` words and holds up to
    ``chunk_size`` words, so consecutive chunks share ``overlap`` words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must be smaller
            than ``chunk_size``.

    Returns:
        The chunks in order, each a single-space-joined string. Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of words")
    step = chunk_size - overlap
    if step <= 0:
        # A non-advancing window used to return no chunks at all (negative
        # step) or fail with an opaque range() error (zero step).
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    # str.split() never yields empty words, so every slice taken at a valid
    # start index is a non-empty chunk.
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), step)
    ]
def get_vector_count(qdrant):
    """Return the collection's ``points_count``, or 0 when it cannot be fetched."""
    try:
        collection_info = qdrant.get_collection(collection_name=COLLECTION_NAME)
        return collection_info.points_count
    except Exception:
        # Best-effort: a missing collection or a client error counts as empty.
        # Unlike a bare except, this lets KeyboardInterrupt and SystemExit
        # propagate.
        return 0
# ============================================================================
# INITIALIZE
# ============================================================================
# Create the shared service clients once per run; without them nothing below
# can work, so a failure stops the script after telling the user what to add.
try:
    qdrant = get_qdrant_client()
    claude = get_claude_client()
    # The literal had been split across two lines by a mis-decoded check-mark
    # emoji, leaving an unterminated string; restored to a single line.
    st.sidebar.success("✅ System Ready")
except Exception as e:
    st.error(f"β Init failed: {e}")
    st.info("Add secrets: QDRANT_URL, QDRANT_API_KEY, ANTHROPIC_API_KEY, HF_TOKEN")
    st.stop()
# ============================================================================
# SIDEBAR
# ============================================================================
st.sidebar.title("π Math AI")
st.sidebar.caption("Production v2.2")
# Database overview: total vector count plus a rough storage estimate.
try:
    vector_count = get_vector_count(qdrant)
    st.sidebar.metric("π Total Vectors", f"{vector_count:,}")
    # Map the active model id back to its catalogue label to find its dimensions.
    current_model_key = None
    current_model_name = st.session_state.embedding_model
    for key, value in EMBEDDING_MODELS.items():
        if value["name"] == current_model_name:
            current_model_key = key
            break
    if current_model_key:
        dimensions = EMBEDDING_MODELS[current_model_key]["dimensions"]
        # 4 bytes per dimension -- presumably float32 vectors; payload and
        # index overhead are not included.
        storage_mb = (vector_count * dimensions * 4) / (1024 * 1024)
        st.sidebar.metric("πΎ DB Storage", f"{storage_mb:.1f} MB")
except Exception:
    # Narrowed from a bare except so KeyboardInterrupt and SystemExit are not
    # swallowed.
    st.sidebar.warning("DB unavailable")
st.sidebar.markdown("---")
# Sidebar section: downloaded-PDF cache statistics and a clear button.
st.sidebar.subheader("πΎ Download Cache")
if CACHE_DIR.exists():
    cached_count = len(list(CACHE_DIR.glob("*.pdf")))
else:
    cached_count = 0
cache_size = get_cache_size()
st.sidebar.metric("Cached PDFs", cached_count)
st.sidebar.metric("Cache Size", f"{cache_size:.1f} MB")
clear_pdf_clicked = st.sidebar.button("ποΈ Clear PDF Cache")
if clear_pdf_clicked:
    clear_cache()
    st.sidebar.success("PDF cache cleared!")
    # Rerun so the metrics above reflect the now-empty cache.
    st.rerun()
st.sidebar.markdown("---")
# OCR Cache management
st.sidebar.subheader("π€ OCR Cache")
ocr_cached_count, ocr_cost_saved = get_ocr_cache_stats()
ocr_cache_size = get_ocr_cache_size()
st.sidebar.metric("OCR Results", ocr_cached_count)
st.sidebar.metric("π° Cost Saved", f"${ocr_cost_saved:.2f}")
st.sidebar.metric("Cache Size", f"{ocr_cache_size:.2f} MB")
# st.button is True only during the single rerun triggered by its click. A
# checkbox nested directly under it disappears on the rerun caused by ticking
# it, so the clear could never execute. Remember the click in session state
# and keep showing the confirmation until it is ticked.
if st.sidebar.button("ποΈ Clear OCR Cache"):
    st.session_state.ocr_clear_pending = True
if st.session_state.get("ocr_clear_pending", False):
    if st.sidebar.checkbox("β οΈ Confirm (will re-OCR on next upload)"):
        # Reset the flag first so the confirmation is not shown after the rerun.
        st.session_state.ocr_clear_pending = False
        clear_ocr_cache()
        st.sidebar.success("OCR cache cleared!")
        st.rerun()
st.sidebar.markdown("---")
# ============================================================================
# TABS
# ============================================================================
# Top-level navigation: one tab per feature area.
_TAB_LABELS = ["π Dataset Manager", "π Search & Solve", "π Statistics"]
tab1, tab2, tab3 = st.tabs(_TAB_LABELS)
# ============================================================================
# TAB 1: DATASET MANAGER
# ============================================================================
with tab1:
st.title("π Dataset Manager")
if not os.getenv("HF_TOKEN"):
st.error("β οΈ Add HF_TOKEN in Settings β Secrets")
st.stop()
# Collection setup
st.header("ποΈ Database Setup")
try:
collections = qdrant.get_collections().collections
exists = any(c.name == COLLECTION_NAME for c in collections)
if exists:
st.success(f"β
Collection '{COLLECTION_NAME}' ready")
else:
st.warning("Collection doesn't exist")
selected_model = st.selectbox("Embedding model:", list(EMBEDDING_MODELS.keys()))
if st.button("ποΈ Create Collection"):
dimensions = EMBEDDING_MODELS[selected_model]["dimensions"]
qdrant.create_collection(
collection_name=COLLECTION_NAME,
vectors_config=VectorParams(size=dimensions, distance=Distance.COSINE)
)
st.success("Created!")
st.session_state.embedding_model = EMBEDDING_MODELS[selected_model]["name"]
st.rerun()
except Exception as e:
st.error(f"Error: {e}")
st.markdown("---")
# Processing configuration - ALWAYS VISIBLE
st.header("βοΈ Configuration")
config_col1, config_col2 = st.columns(2)
with config_col1:
st.subheader("Chunking Settings")
chunk_size = st.slider("Chunk size (words):", 50, 500, 150, key="chunk_size_slider")
chunk_overlap = st.slider("Overlap (words):", 0, 100, 30, key="chunk_overlap_slider")
# Show effective chunk size
effective_size = chunk_size - chunk_overlap
st.caption(f"π Effective chunk: {effective_size} words")
with config_col2:
st.subheader("Embedding Model")
# Get current model
current_model_name = st.session_state.embedding_model
current_model_key = None
for key, value in EMBEDDING_MODELS.items():
if value["name"] == current_model_name:
current_model_key = key
break
if not current_model_key:
current_model_key = "MiniLM-L6 (Fast, 384D)"
selected_embedding = st.selectbox(
"Select model:",
list(EMBEDDING_MODELS.keys()),
index=list(EMBEDDING_MODELS.keys()).index(current_model_key),
key="embedding_selector"
)
# Display model info
model_info = EMBEDDING_MODELS[selected_embedding]
st.info(f"""
**Active Model:** {selected_embedding}
- **Dimensions:** {model_info['dimensions']}D
- **Speed:** {model_info['speed']}
- **Quality:** {model_info['quality']}
""")
# Update session state only if different
if st.session_state.embedding_model != model_info['name']:
if st.button("π Apply Model Change"):
st.session_state.embedding_model = model_info['name']
st.success("Model updated! New uploads will use this model.")
st.rerun()
use_context = st.checkbox("Use context for OCR (books + exams)", value=True, key="use_context_checkbox")
st.markdown("---")
# Duplicate Detection Strategy
st.header("π Duplicate Detection Strategy")
strategy_col1, strategy_col2 = st.columns([3, 1])
with strategy_col1:
duplicate_strategy = st.radio(
"How should we check for duplicates?",
[
"π Filename only (fastest, ignores settings changes)",
"βοΈ Filename + Settings (recommended, re-process if chunk size changes)",
"π Filename + Settings + Model (strictest, re-process if anything changes)"
],
index=1, # Default to recommended
help="Choose how strict duplicate detection should be"
)
with strategy_col2:
force_reprocess = st.checkbox(
"π Force re-process",
value=False,
help="Re-process files even if they already exist (OCR will still use cache)"
)
# Map radio selection to strategy code
strategy_map = {
"π Filename only (fastest, ignores settings changes)": "filename_only",
"βοΈ Filename + Settings (recommended, re-process if chunk size changes)": "filename_settings",
"π Filename + Settings + Model (strictest, re-process if anything changes)": "filename_full"
}
selected_strategy = strategy_map[duplicate_strategy]
st.markdown("---")
# Data sources
st.header("π Data Sources")
source_tabs = st.tabs(["π Your Files", "π Public Datasets"])
with source_tabs[0]:
folder_type = st.radio(
"Select folder type:",
["π Books", "π Exams", "ποΈ Answers (OCR)"],
horizontal=True,
key="folder_type_radio"
)
if "Books" in folder_type:
folder_path, doc_type = "books/", "book"
elif "Exams" in folder_type:
folder_path, doc_type = "exams/", "exam"
else:
folder_path, doc_type = "answers/", "answer_handwritten"
col_scan, col_refresh = st.columns([3, 1])
with col_scan:
if st.button(f"π Scan {folder_path}", key="scan_button"):
with st.spinner("Scanning HuggingFace dataset..."):
files = list_dataset_files(folder_path)
if files:
file_status = []
# Get current settings for checking
current_chunk = chunk_size
current_model = st.session_state.embedding_model
for file in files:
name = file.split('/')[-1]
# Check based on selected strategy
if force_reprocess:
processed = False
vector_count_file = 0
else:
processed, vector_count_file = check_if_processed(
qdrant,
name,
chunk_size=current_chunk if selected_strategy != "filename_only" else None,
embedding_model=current_model if selected_strategy == "filename_full" else None,
strategy=selected_strategy
)
# Check if file is cached locally
is_cached = is_file_cached(file)
# Check if OCR is cached (for answer files)
ocr_cached = is_ocr_cached(name) if doc_type == "answer_handwritten" else False
file_status.append({
"file": file,
"name": name,
"processed": processed,
"vectors": vector_count_file,
"cached": is_cached,
"ocr_cached": ocr_cached
})
st.session_state.current_files = file_status
st.session_state.current_folder = folder_path
st.session_state.current_doc_type = doc_type
st.session_state.processing_complete = False
else:
st.warning(f"No PDF files found in {folder_path}")
with col_refresh:
if 'current_files' in st.session_state:
if st.button("π Refresh", key="refresh_button"):
# Re-check all files with current settings
current_chunk = chunk_size
current_model = st.session_state.embedding_model
for file_info in st.session_state.current_files:
if force_reprocess:
file_info['processed'] = False
file_info['vectors'] = 0
else:
processed, vector_count = check_if_processed(
qdrant,
file_info['name'],
chunk_size=current_chunk if selected_strategy != "filename_only" else None,
embedding_model=current_model if selected_strategy == "filename_full" else None,
strategy=selected_strategy
)
file_info['processed'] = processed
file_info['vectors'] = vector_count
file_info['cached'] = is_file_cached(file_info['file'])
file_info['ocr_cached'] = is_ocr_cached(file_info['name']) if st.session_state.current_doc_type == "answer_handwritten" else False
st.rerun()
# Display files if scanned
if 'current_files' in st.session_state and st.session_state.current_folder == folder_path:
processed_count = sum(1 for f in st.session_state.current_files if f['processed'])
pending_count = len(st.session_state.current_files) - processed_count
total_vectors = sum(f['vectors'] for f in st.session_state.current_files)
cached_count_files = sum(1 for f in st.session_state.current_files if f.get('cached', False))
ocr_cached_count_files = sum(1 for f in st.session_state.current_files if f.get('ocr_cached', False))
# Summary metrics
if doc_type == "answer_handwritten":
metric_col1, metric_col2, metric_col3, metric_col4, metric_col5, metric_col6 = st.columns(6)
else:
metric_col1, metric_col2, metric_col3, metric_col4, metric_col5 = st.columns(5)
metric_col6 = None
metric_col1.metric("π Total Files", len(st.session_state.current_files))
metric_col2.metric("β
Processed", processed_count)
metric_col3.metric("β³ Pending", pending_count)
metric_col4.metric("π’ Vectors", f"{total_vectors:,}")
metric_col5.metric("πΎ PDF Cache", cached_count_files)
if doc_type == "answer_handwritten" and metric_col6:
metric_col6.metric("π€ OCR Cache", ocr_cached_count_files)
# File list: one row per scanned file with a selection checkbox and status
# captions. Ticked entries are collected into `selected_files`, which the
# preview and processing sections below consume.
st.markdown("---")
st.subheader("File Status & Selection")
# File selection with status
selected_files = []
for file_info in st.session_state.current_files:
# Handwritten answers get a fifth column for the OCR cache status.
if doc_type == "answer_handwritten":
col1, col2, col3, col4, col5 = st.columns([3, 1, 1, 1, 1])
else:
col1, col2, col3, col4 = st.columns([3, 1, 1, 1])
col5 = None
with col1:
# Already-processed files are shown disabled and unticked unless
# force_reprocess is on; everything else is ticked by default.
# Both branches use the same widget key, derived from the file name.
if file_info['processed'] and not force_reprocess:
# NOTE(review): this f-string is split across two physical lines (likely a
# mis-decoded emoji that became a line break); invalid Python as written.
checkbox_label = f"β
{file_info['name']}"
is_selected = st.checkbox(
checkbox_label,
value=False,
disabled=True,
key=f"file_{file_info['name']}"
)
else:
checkbox_label = f"β³ {file_info['name']}"
is_selected = st.checkbox(
checkbox_label,
value=True,
key=f"file_{file_info['name']}"
)
# NOTE(review): indentation was lost, so it is unclear whether this check sits
# inside the else-branch or after the whole if/else - confirm against the
# original file.
if is_selected:
selected_files.append(file_info)
with col2:
# The vector count is shown only for files treated as processed.
if file_info['processed'] and not force_reprocess:
st.caption(f"π’ {file_info['vectors']} vectors")
else:
st.caption("Not uploaded")
with col3:
# PDF Cache status
if file_info.get('cached', False):
st.caption("πΎ PDF")
else:
st.caption("βοΈ PDF")
with col4:
# Status marker: one symbol for "processed and not being forced", another
# for everything else.
if file_info['processed'] and not force_reprocess:
status_color = "π’"
else:
status_color = "π΄"
st.caption(status_color)
# OCR Cache status (only for answers)
if col5 and doc_type == "answer_handwritten":
with col5:
if file_info.get('ocr_cached', False):
st.caption("π€ OCR β")
else:
st.caption("π€ OCR β")
# Sizing estimation for selected files
# Everything in this section is an extrapolation: only the first selected file
# is measured and its figures are multiplied by the number of selected files.
if selected_files:
st.markdown("---")
st.subheader("π Processing Preview")
# Download one file to estimate (use cache if available)
sample_file = selected_files[0]
with st.spinner("Calculating estimates..."):
# cache_hit is returned but not used in this section.
local_path, cache_hit = download_with_cache(sample_file['file'])
# A falsy local_path (download failed) skips the estimates.
if local_path:
est_chunks, est_words = estimate_chunks(local_path, chunk_size, chunk_overlap)
# Calculate totals
total_est_chunks = est_chunks * len(selected_files)
total_est_words = est_words * len(selected_files)
# Get embedding dimensions
# Look the active model up by its "name" field; 384 is used when no entry of
# EMBEDDING_MODELS matches.
current_model_name = st.session_state.embedding_model
dimensions = 384 # default
for key, value in EMBEDDING_MODELS.items():
if value["name"] == current_model_name:
dimensions = value["dimensions"]
break
# chunks x dimensions x 4, converted to MiB
# (the 4 is presumably bytes per float32 component - TODO confirm).
est_storage_mb = (total_est_chunks * dimensions * 4) / (1024 * 1024)
# Count cache usage
pdf_to_download = sum(1 for f in selected_files if not f.get('cached', False))
pdf_from_cache = len(selected_files) - pdf_to_download
# Display estimates
est_col1, est_col2, est_col3, est_col4 = st.columns(4)
est_col1.metric("π Files", len(selected_files))
est_col2.metric("π Est. Words", f"{total_est_words:,}")
est_col3.metric("βοΈ Est. Chunks", f"{total_est_chunks:,}")
est_col4.metric("πΎ Est. Storage", f"{est_storage_mb:.2f} MB")
# Cache info
cache_col1, cache_col2 = st.columns(2)
cache_col1.metric("βοΈ PDFs to Download", pdf_to_download)
cache_col2.metric("πΎ PDFs from Cache", pdf_from_cache)
if pdf_from_cache > 0:
st.success(f"β¨ {pdf_from_cache} PDF(s) will be loaded from cache (faster!)")
# OCR cost estimation for answers
if doc_type == "answer_handwritten":
ocr_to_run = sum(1 for f in selected_files if not f.get('ocr_cached', False))
ocr_from_cache = len(selected_files) - ocr_to_run
ocr_col1, ocr_col2 = st.columns(2)
ocr_col1.metric("π€ Files need OCR", ocr_to_run)
ocr_col2.metric("π€ OCR from Cache", ocr_from_cache)
if ocr_from_cache > 0:
st.success(f"π° {ocr_from_cache} file(s) have cached OCR (NO COST!)")
if ocr_to_run > 0:
# Estimate ~5 pages per exam, $0.003 per 1K input tokens, ~800 tokens per page
# (0.000003 below is that per-1K price expressed per single token).
est_pages = ocr_to_run * 5
est_tokens = est_pages * 800
est_cost = est_tokens * 0.000003
st.warning(f"β οΈ **OCR Cost Estimate:** ~${est_cost:.2f} ({est_pages} pages, {est_tokens:,} tokens)")
# Presumably pairs with `if ocr_to_run > 0` (indentation lost - confirm).
else:
st.success("π All OCR cached - NO OCR COST!")
# NOTE(review): the nesting level of this divider cannot be recovered from
# this copy of the file.
st.markdown("---")
# Process button
# Pressing the button runs the whole ingestion pipeline for `selected_files`
# within this script run. This section prepares the embedder, the optional OCR
# context, the running totals and the progress widgets.
if st.button("π PROCESS SELECTED FILES", type="primary", key="process_button"):
current_model_name = st.session_state.embedding_model
embedder = get_embedding_model(current_model_name)
# Get context for OCR from books AND exams
# Only built for handwritten answers when use_context is set; otherwise the
# OCR call receives an empty string.
context_books = ""
if doc_type == "answer_handwritten" and use_context:
try:
# Get book context
# Up to 10 points are requested, but only the first 3 are used below.
book_samples = qdrant.scroll(
collection_name=COLLECTION_NAME,
limit=10,
with_payload=True,
with_vectors=False,
scroll_filter={"must": [{"key": "source_type", "match": {"value": "book"}}]}
)
# Get exam context
exam_samples = qdrant.scroll(
collection_name=COLLECTION_NAME,
limit=10,
with_payload=True,
with_vectors=False,
scroll_filter={"must": [{"key": "source_type", "match": {"value": "exam"}}]}
)
# Each excerpt is the first 500 characters of a point's 'content' payload;
# index [0] of the scroll result is treated as the list of points.
context_parts = []
if book_samples and book_samples[0]:
context_parts.append("=== FROM TEXTBOOKS ===")
context_parts.append("\n".join([p.payload['content'][:500] for p in book_samples[0][:3]]))
if exam_samples and exam_samples[0]:
context_parts.append("\n=== FROM EXAMS ===")
context_parts.append("\n".join([p.payload['content'][:500] for p in exam_samples[0][:3]]))
context_books = "\n\n".join(context_parts)
# Best effort: on any failure a warning is shown and context_books keeps
# whatever value it had (the empty string unless the join already ran).
except Exception as e:
st.warning(f"Could not load context: {e}")
# Running totals across all files handled in this press of the button.
total_tokens = 0
total_vectors = 0
total_ocr_cost = 0
# Per-file results keyed by file name; stored in session state afterwards.
processing_stats = {}
# Create progress tracking
progress_bar = st.progress(0)
status_text = st.empty()
# Per-file pipeline: download -> (OCR | text extraction) -> chunk -> embed ->
# upsert. A failure at any step shows an error and moves on to the next file;
# such files never get an entry in processing_stats.
for idx, file_info in enumerate(selected_files):
# Update progress
# idx / len is the fraction finished before this file starts.
progress = (idx) / len(selected_files)
progress_bar.progress(progress)
status_text.text(f"Processing {idx + 1}/{len(selected_files)}: {file_info['name']}")
with st.expander(f"π {file_info['name']}", expanded=True):
try:
# Download PDF (use cache if available)
# Both branches make the same download_with_cache call; they differ only in
# the messages shown around it.
if file_info.get('cached', False):
st.write("πΎ Loading PDF from cache...")
local_path, cache_hit = download_with_cache(file_info['file'])
if cache_hit:
# NOTE(review): this and several later literals are split across two
# physical lines (likely a mis-decoded emoji); invalid Python as written.
st.write("β
PDF loaded from cache")
else:
st.write("βοΈ Downloading PDF from HuggingFace...")
local_path, cache_hit = download_with_cache(file_info['file'])
if not cache_hit:
st.write("β
PDF downloaded and cached")
if not local_path:
st.error("β Download failed")
continue
# Timing starts after the download, so 'time' in the stats excludes it.
file_start_time = time.time()
tokens_used = 0
if doc_type == "answer_handwritten":
# CHECK OCR CACHE FIRST
if file_info.get('ocr_cached', False):
st.write("π€ Loading OCR from cache...")
text, cached_tokens = load_ocr_from_cache(file_info['name'])
if text:
st.success(f"β
OCR loaded from cache! (Saved ${cached_tokens * 0.000003:.3f})")
tokens_used = 0 # No new tokens used
else:
# Clearing the flag makes the OCR branch below run for this file. file_info
# is the same dict object that was appended from
# st.session_state.current_files, so the change is visible there too.
st.warning("Cache load failed, will run OCR")
file_info['ocr_cached'] = False
# Run OCR if not cached
if not file_info.get('ocr_cached', False):
st.write("πΌοΈ Converting to images...")
images = pdf_to_images(local_path)
if not images:
st.error("β Conversion failed")
continue
st.write(f"β
Converted {len(images)} pages")
transcribed = []
tokens_used = 0
for i, img in enumerate(images, 1):
st.write(f"π€ Running OCR on page {i}/{len(images)}...")
trans, tok = ocr_with_claude(claude, img, context_books)
# Pages whose transcription is falsy are skipped without any message and
# their token count is not added.
if trans:
transcribed.append(f"\n=== Page {i} ===\n\n{trans}")
tokens_used += tok
if not transcribed:
st.error("β OCR failed")
continue
text = "\n\n".join(transcribed)
# SAVE OCR TO CACHE
# NOTE(review): if only some pages transcribed, this partial text is what
# gets cached and reused on later runs - confirm that is acceptable.
save_ocr_to_cache(file_info['name'], text, tokens_used)
st.success(f"β
OCR complete & cached! {len(text):,} chars (Cost: ${tokens_used * 0.000003:.3f})")
# tokens_used is 0 on a cache hit, so these additions only change the
# totals when OCR actually ran.
total_tokens += tokens_used
total_ocr_cost += tokens_used * 0.000003
else:
st.write("π Extracting text from PDF...")
text = extract_text_from_pdf(local_path)
if not text:
st.error("β Extraction failed")
continue
st.write(f"β
Extracted {len(text):,} characters")
st.write("βοΈ Chunking text...")
chunks = chunk_text(text, chunk_size, chunk_overlap)
st.write(f"β
Created {len(chunks)} chunks")
st.write("π’ Generating embeddings...")
embeddings = embedder.encode(chunks, show_progress_bar=False)
st.write("πΎ Uploading to vector database...")
points = []
for i, (chunk, emb) in enumerate(zip(chunks, embeddings)):
# NOTE(review): the id string includes time.time(), so a file gets different
# ids on every run; re-processing therefore looks like it adds new points
# instead of overwriting the earlier ones - confirm intended.
points.append(PointStruct(
id=abs(hash(f"{file_info['file']}_{i}_{time.time()}")) % (2**63),
vector=emb.tolist(),
payload={
"content": chunk,
"source_name": file_info['name'],
"source_type": doc_type,
"chunk_index": i,
"chunk_size": chunk_size,
"embedding_model": current_model_name
}
))
qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
total_vectors += len(points)
file_time = time.time() - file_start_time
st.success(f"β
Uploaded {len(points)} vectors in {file_time:.1f}s!")
# Store stats
# 'ocr_cached' reflects the flag after any reset above; 'pdf_cached' is the
# cache_hit value returned by the download step.
processing_stats[file_info['name']] = {
'vectors': len(points),
'chunks': len(chunks),
'time': file_time,
'tokens': tokens_used,
'ocr_cached': file_info.get('ocr_cached', False),
'pdf_cached': cache_hit
}
# Any exception raised while handling this file is shown in its expander and
# the loop carries on with the next file.
except Exception as e:
st.error(f"β Error: {e}")
# Complete progress
# NOTE(review): the message below reports len(selected_files), which also
# counts files that were skipped or failed in the loop above.
progress_bar.progress(1.0)
# NOTE(review): this f-string is split across two physical lines (likely a
# mis-decoded emoji); invalid Python as written.
status_text.text(f"β
Completed! Processed {len(selected_files)} files")
# Store results in session state
# These values are what the "persistent results" section reads back.
st.session_state.processing_complete = True
st.session_state.last_processed_files = selected_files
st.session_state.processing_stats = processing_stats
st.balloons()
# Final summary
st.markdown("---")
st.success(f"π **Processing Complete!**")
# Cache-hit counts come from processing_stats, so they only cover files that
# reached the upload step.
pdf_cached_loads = sum(1 for s in processing_stats.values() if s.get('pdf_cached', False))
ocr_cached_loads = sum(1 for s in processing_stats.values() if s.get('ocr_cached', False))
summary_col1, summary_col2, summary_col3, summary_col4 = st.columns(4)
summary_col1.metric("π Files", len(selected_files))
summary_col2.metric("π’ Vectors", f"{total_vectors:,}")
summary_col3.metric("πΎ PDF Cache Hits", pdf_cached_loads)
summary_col4.metric("π€ OCR Cache Hits", ocr_cached_loads)
# The cost line appears only when OCR tokens were spent in this run. For
# handwritten answers a zero total shows the "all from cache" message, which
# is also what appears when no file got as far as OCR.
if total_ocr_cost > 0:
st.info(f"π° **Total OCR Cost:** ${total_ocr_cost:.3f}")
elif doc_type == "answer_handwritten":
st.success("π **All OCR from cache - $0.00 cost!**")
# Show persistent results if processing was completed
# Re-displays the stats saved in session state by an earlier processing run.
# NOTE(review): indentation was lost, so this `elif` could pair with either the
# process-button `if` or the earlier `if selected_files:` - confirm against the
# original file before restructuring.
# processing_complete and processing_stats are read as attributes here, so
# they are presumably initialised earlier in the script - verify.
elif st.session_state.processing_complete and st.session_state.processing_stats:
st.markdown("---")
st.info("βΉοΈ Last processing session completed. Results shown below.")
st.subheader("π Processing Results")
# Totals are recomputed from the stored per-file stats.
total_vectors = sum(stat['vectors'] for stat in st.session_state.processing_stats.values())
total_tokens = sum(stat['tokens'] for stat in st.session_state.processing_stats.values())
pdf_cached = sum(1 for s in st.session_state.processing_stats.values() if s.get('pdf_cached', False))
ocr_cached = sum(1 for s in st.session_state.processing_stats.values() if s.get('ocr_cached', False))
result_col1, result_col2, result_col3, result_col4 = st.columns(4)
result_col1.metric("π Files", len(st.session_state.processing_stats))
result_col2.metric("π’ Vectors", f"{total_vectors:,}")
result_col3.metric("πΎ PDF Cache", pdf_cached)
result_col4.metric("π€ OCR Cache", ocr_cached)
# Uses the same per-token price (0.000003) as the processing loop.
if total_tokens > 0:
st.info(f"π° **OCR Cost:** ${total_tokens * 0.000003:.3f}")
# Detailed breakdown
# One line per processed file: cache indicators plus its stored figures.
with st.expander("π Detailed Breakdown"):
for filename, stats in st.session_state.processing_stats.items():
pdf_status = "πΎ" if stats.get('pdf_cached', False) else "βοΈ"
ocr_status = "π€β" if stats.get('ocr_cached', False) else "π€β"
st.markdown(f"**{filename}** - {pdf_status} PDF | {ocr_status} OCR")
st.caption(f"Vectors: {stats['vectors']:,} | Chunks: {stats['chunks']} | Time: {stats['time']:.1f}s | Tokens: {stats['tokens']:,}")
# Debug panel: offered once a folder scan has populated session state.
if 'current_files' in st.session_state:
    with st.expander("π§ Debug Info", expanded=False):
        # Scan context and chosen options first, then the cache file counts,
        # rendered one caption per row in this order.
        debug_rows = (
            ("Folder", st.session_state.current_folder),
            ("Doc Type", st.session_state.current_doc_type),
            ("Strategy", selected_strategy),
            ("Force Reprocess", force_reprocess),
            ("OCR Cache Files", ocr_cached_count),
            ("PDF Cache Files", cached_count),
        )
        for row_label, row_value in debug_rows:
            st.caption(f"**{row_label}:** {row_value}")
# Public dataset tab: pick one of three math datasets, embed the first
# `sample_size` problems and upsert them into the vector collection.
with source_tabs[1]:
dataset_choice = st.selectbox(
"Select public dataset:",
["GSM8K - Grade School Math", "MATH - Competition Math", "MathQA - Word Problems"],
key="dataset_selector"
)
sample_size = st.slider("Number of samples:", 10, 2000, 100, key="sample_size_slider")
# The short name is the part of the label before " - "; it is used as the
# source_name in the payload and for the "already loaded" lookup.
dataset_name = dataset_choice.split(" - ")[0]
# Check if already loaded
already_loaded, vectors_count = check_if_processed(
qdrant,
dataset_name,
strategy="filename_only"
)
if already_loaded:
# NOTE(review): this and two later literals are split across two physical
# lines (likely a mis-decoded emoji); invalid Python as written.
st.success(f"β
**{dataset_name}** already loaded with {vectors_count:,} vectors!")
else:
st.info(f"π₯ {dataset_name} not yet loaded")
# NOTE(review): indentation was lost, so it is unclear whether the load button
# is offered only in the else-branch or regardless of already_loaded - confirm
# against the original file.
if st.button(f"π₯ Load {dataset_name}", type="primary", key="load_dataset_button"):
try:
# Imported here, inside the try, so a missing package is reported by the
# handler below together with the requirements.txt hint.
from datasets import load_dataset
current_model_name = st.session_state.embedding_model
embedder = get_embedding_model(current_model_name)
with st.spinner(f"Loading {dataset_name}..."):
# The substring tests are case-sensitive: "MATH" matches only the
# "MATH - Competition Math" label, so MathQA falls through to the else.
# NOTE(review): trust_remote_code=True presumably lets the dataset
# repository run its own loading script - confirm this is intended.
if "GSM8K" in dataset_choice:
dataset = load_dataset("openai/gsm8k", "main", split="train", trust_remote_code=True)
texts = [f"Problem: {dataset[i]['question']}\n\nSolution: {dataset[i]['answer']}"
for i in range(min(sample_size, len(dataset)))]
elif "MATH" in dataset_choice:
dataset = load_dataset("lighteval/MATH", split="train", trust_remote_code=True)
texts = [f"Problem: {dataset[i].get('problem', '')}\n\nSolution: {dataset[i].get('solution', '')}"
for i in range(min(sample_size, len(dataset)))]
else:
dataset = load_dataset("allenai/math_qa", split="train", trust_remote_code=True)
texts = [f"Problem: {dataset[i]['Problem']}\n\nAnswer: {dataset[i]['correct']}"
for i in range(min(sample_size, len(dataset)))]
st.write(f"β
Loaded {len(texts)} problems")
st.write("π’ Generating embeddings...")
embeddings = embedder.encode(texts, show_progress_bar=True)
st.write("πΎ Uploading to vector database...")
points = []
for i, (text, emb) in enumerate(zip(texts, embeddings)):
# The embedding is computed from the full text, while the stored 'content'
# is cut to the first 2000 characters.
# NOTE(review): the id string includes time.time(), so loading the same
# dataset again looks like it adds new points rather than overwriting -
# confirm intended.
points.append(PointStruct(
id=abs(hash(f"{dataset_name}_{i}_{time.time()}")) % (2**63),
vector=emb.tolist(),
payload={
"content": text[:2000],
"source_name": dataset_name,
"source_type": "public_dataset",
"index": i,
"chunk_size": "N/A",
"embedding_model": current_model_name
}
))
qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
st.success(f"β
Uploaded {len(points)} vectors!")
st.balloons()
# Every failure (import, download, embedding, upload) ends up here and is
# shown with the same 'datasets' hint.
except Exception as e:
st.error(f"β Error: {e}")
st.info("π‘ Make sure 'datasets' is in your requirements.txt")
# ============================================================================
# TAB 2: SEARCH & SOLVE
# ============================================================================
with tab2:
    # Search & Solve: embed the problem text, retrieve the closest stored
    # chunks from the vector collection, then ask Claude for a solution that
    # uses those chunks as references.
    # NOTE(review): the emoji prefixes in the UI strings of this file look
    # mis-decoded; they are left untouched here except for the one literal
    # that was broken across two lines (see below).
    st.title("π Search & Solve")
    problem = st.text_area(
        "Enter your math problem:",
        placeholder="Find gradient of L(w) = (1/2)||Xw - y||Β²",
        height=150,
        key="problem_input"
    )
    # Both widgets publish their value through session state under the keys
    # "top_k" and "detail", which is where the code below reads them.
    col1, col2 = st.columns(2)
    col1.slider("Retrieve top K:", 3, 20, 5, key="top_k")
    col2.select_slider("Detail level:", ["Concise", "Standard", "Detailed", "Exhaustive"], value="Detailed", key="detail")
    # Nothing runs unless the button is pressed with a non-empty problem.
    if st.button("π SOLVE", type="primary", key="solve_button") and problem:
        current_model_name = st.session_state.embedding_model
        embedder = get_embedding_model(current_model_name)
        with st.spinner("Searching knowledge base..."):
            query_emb = embedder.encode(problem)
            try:
                results = qdrant.search(
                    collection_name=COLLECTION_NAME,
                    query_vector=query_emb.tolist(),
                    limit=st.session_state.top_k
                )
            except Exception as e:
                # This used to be a bare `except:`, which also traps
                # BaseException subclasses and hid the real cause behind the
                # generic "no results" warning. Show the cause, then fall
                # through to the same empty-result path as before.
                st.error(f"Search failed: {e}")
                results = []
        if not results:
            st.warning("β οΈ No results found. Please load data in Dataset Manager first.")
        else:
            # This literal was split across two physical lines by a
            # mis-decoded check-mark emoji; rejoined so the call is valid.
            st.success(f"✅ Found {len(results)} relevant references!")
            # Read 'content' defensively, like the other payload fields, so a
            # point without it does not abort the whole request.
            reference_texts = [(r.payload or {}).get('content', '') for r in results]
            with st.expander("π Retrieved References", expanded=False):
                for i, (r, ref_text) in enumerate(zip(results, reference_texts), 1):
                    ref_payload = r.payload or {}
                    st.markdown(f"**Reference {i}** (Relevance: {r.score*100:.1f}%)")
                    st.text(ref_text[:300] + "...")
                    st.caption(f"π Source: {ref_payload.get('source_name')} | Type: {ref_payload.get('source_type')}")
                    st.markdown("---")
            with st.spinner("Generating solution with Claude..."):
                # The full text of every retrieved chunk goes into the prompt.
                context = "\n\n".join(reference_texts)
                prompt = f"""Solve the following math problem using the provided references.
PROBLEM:
{problem}
REFERENCES:
{context}
DETAIL LEVEL: {st.session_state.detail}
Please provide your response in the following format:
## SOLUTION
[Step-by-step solution]
## REASONING
[Explain why you solved it this way]
## REFERENCES
[Cite which sources you used]"""
                try:
                    message = claude.messages.create(
                        model="claude-sonnet-4-20250514",
                        max_tokens=4000,
                        messages=[{"role": "user", "content": prompt}]
                    )
                    # The first content block's text is both rendered and
                    # offered as a Markdown download.
                    solution_text = message.content[0].text
                    st.markdown("---")
                    st.markdown("## π Solution")
                    st.markdown(solution_text)
                    st.download_button(
                        "π₯ Download Solution",
                        solution_text,
                        file_name=f"solution_{int(time.time())}.md",
                        mime="text/markdown"
                    )
                except Exception as e:
                    st.error(f"β Error generating solution: {e}")
# ============================================================================
# TAB 3: STATISTICS
# ============================================================================
with tab3:
    # Statistics tab: summarises what is stored in the vector collection.
    st.title("π Database Statistics")
    try:
        # Get sample of all data (payloads only). At most 1000 points are
        # read, so every breakdown below describes this sample rather than
        # the whole collection; only "Total Vectors" is a full count.
        sample = qdrant.scroll(
            collection_name=COLLECTION_NAME,
            limit=1000,
            with_payload=True,
            with_vectors=False
        )
        # The first element of the scroll result is the list of points.
        sampled_points = sample[0] if sample else []
        if sampled_points:
            types = {}
            sources = set()
            source_vectors = {}
            chunk_sizes = {}
            models = {}
            for point in sampled_points:
                # Tolerate points stored without a payload.
                payload = point.payload or {}
                src_type = payload.get('source_type', 'unknown')
                src_name = payload.get('source_name', 'Unknown')
                chunk_size_val = payload.get('chunk_size', 'N/A')
                model_val = payload.get('embedding_model', 'N/A')
                types[src_type] = types.get(src_type, 0) + 1
                sources.add(src_name)
                source_vectors[src_name] = source_vectors.get(src_name, 0) + 1
                # 'N/A' marks entries without a real chunk size / model.
                if chunk_size_val != 'N/A':
                    chunk_sizes[chunk_size_val] = chunk_sizes.get(chunk_size_val, 0) + 1
                if model_val != 'N/A':
                    # Keep only the part after the last '/' and cap it at 30
                    # characters for display.
                    model_short = model_val.split('/')[-1][:30]
                    models[model_short] = models.get(model_short, 0) + 1
            # Overall metrics
            total_vectors = get_vector_count(qdrant)
            col1, col2, col3 = st.columns(3)
            col1.metric("π Total Vectors", f"{total_vectors:,}")
            col2.metric("π Unique Sources", len(sources))
            col3.metric("π Document Types", len(types))
            # Say so when the breakdowns cover only part of the collection;
            # otherwise the percentages read as if they were exact.
            if total_vectors > len(sampled_points):
                st.caption(
                    f"Breakdown below is based on a sample of "
                    f"{len(sampled_points):,} of {total_vectors:,} vectors."
                )
            st.markdown("---")
            # Distribution by type
            st.subheader("π Distribution by Document Type")
            # The loop variable is deliberately not called `doc_type`: in this
            # flat script that name is the document type chosen in the Dataset
            # Manager tab, and rebinding it here would overwrite it.
            type_total = sum(types.values())
            for type_name, count in sorted(types.items(), key=lambda x: x[1], reverse=True):
                share = count / type_total
                pct = share * 100
                st.progress(share, text=f"{type_name}: {count:,} vectors ({pct:.1f}%)")
            st.markdown("---")
            # Chunk sizes used
            if chunk_sizes:
                st.subheader("βοΈ Chunk Sizes Used")
                chunk_total = sum(chunk_sizes.values())
                for size, count in sorted(chunk_sizes.items()):
                    pct = count / chunk_total * 100
                    st.caption(f"β’ **{size} words**: {count:,} vectors ({pct:.1f}%)")
            # Models used
            if models:
                st.subheader("π€ Embedding Models Used")
                model_total = sum(models.values())
                for model, count in sorted(models.items(), key=lambda x: x[1], reverse=True):
                    pct = count / model_total * 100
                    st.caption(f"β’ **{model}**: {count:,} vectors ({pct:.1f}%)")
            st.markdown("---")
            # All sources
            st.subheader("π All Data Sources")
            for src in sorted(sources):
                vector_count = source_vectors.get(src, 0)
                st.caption(f"β’ **{src}** - {vector_count:,} vectors")
        else:
            st.info("π No data in database yet. Upload some files in the Dataset Manager!")
    except Exception as e:
        # Any failure while reading or rendering the statistics is shown
        # here instead of breaking the page.
        st.error(f"β Error loading statistics: {e}")
# Sidebar footer with the app version. The trailing "|" that followed this
# call was stray residue and made the statement a syntax error; removed.
st.sidebar.caption("Powered by Claude AI β’ v2.2")