Spaces:

Hebaelsayed
/

math-ai-system

Sleeping

App Files Files Community

math-ai-system / src /streamlit_app.py

Hebaelsayed

Update src/streamlit_app.py

f41b68e verified 25 days ago

raw

history blame contribute delete

28.6 kB

	import streamlit as st
	import os
	import time
	from qdrant_client import QdrantClient
	from qdrant_client.models import Distance, VectorParams, PointStruct
	from sentence_transformers import SentenceTransformer
	import PyPDF2
	import io

	# ============================================================================
	# CONFIGURATION
	# ============================================================================

	st.set_page_config(
	page_title="Math AI - Phase 2.5: Database + PDF",
	page_icon="🗄️",
	layout="wide"
	)

	COLLECTION_NAME = "math_knowledge_base"

	# ============================================================================
	# CACHED FUNCTIONS
	# ============================================================================

	@st.cache_resource(show_spinner="🔌 Connecting to Qdrant...")
	def get_qdrant_client():
	qdrant_url = os.getenv("QDRANT_URL")
	qdrant_api_key = os.getenv("QDRANT_API_KEY")

	if not qdrant_url or not qdrant_api_key:
	return None

	return QdrantClient(url=qdrant_url, api_key=qdrant_api_key)

	@st.cache_resource(show_spinner="🤖 Loading embedding model (30-60s first time)...")
	def get_embedding_model():
	try:
	model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
	return model
	except Exception as e:
	st.error(f"Failed to load model: {e}")
	return None

	def get_vector_count_reliable(client, collection_name):
	try:
	count = 0
	offset = None
	max_iterations = 1000

	for _ in range(max_iterations):
	result = client.scroll(
	collection_name=collection_name,
	limit=100,
	offset=offset,
	with_payload=False,
	with_vectors=False
	)

	if result is None or result[0] is None or len(result[0]) == 0:
	break

	count += len(result[0])
	offset = result[1]

	if offset is None:
	break

	return count
	except:
	return 0

	def check_collection_exists(client, collection_name):
	try:
	collections = client.get_collections().collections
	return any(c.name == collection_name for c in collections)
	except:
	return False

	def extract_text_from_pdf(pdf_file):
	"""Extract text from PDF file"""
	try:
	pdf_reader = PyPDF2.PdfReader(pdf_file)
	text = ""

	for page_num, page in enumerate(pdf_reader.pages):
	page_text = page.extract_text()
	text += f"\n\n--- Page {page_num + 1} ---\n\n{page_text}"

	return text
	except Exception as e:
	st.error(f"PDF extraction error: {str(e)}")
	return None

	# ============================================================================
	# SESSION STATE
	# ============================================================================

	if 'db_created' not in st.session_state:
	st.session_state.db_created = False

	if 'embedder_ready' not in st.session_state:
	st.session_state.embedder_ready = False

	if 'show_step' not in st.session_state:
	st.session_state.show_step = 'all'

	# ============================================================================
	# MAIN APP
	# ============================================================================

	st.title("🗄️ Phase 2.5: Database Setup + PDF Upload")

	client = get_qdrant_client()
	embedder = get_embedding_model()

	# ============================================================================
	# SIDEBAR
	# ============================================================================

	with st.sidebar:
	st.header("⚡ Quick Navigation")

	if st.button("📋 Show All Steps", use_container_width=True):
	st.session_state.show_step = 'all'

	if st.button("🚀 Skip to Upload", use_container_width=True):
	st.session_state.show_step = 'upload'

	if st.button("🔍 Skip to Search", use_container_width=True):
	st.session_state.show_step = 'search'

	st.markdown("---")
	st.subheader("📊 System Status")

	if client and check_collection_exists(client, COLLECTION_NAME):
	st.success("✅ Database Ready")
	st.session_state.db_created = True
	else:
	st.warning("⚠️ Database Not Ready")

	if embedder:
	st.success("✅ Model Loaded")
	st.session_state.embedder_ready = True
	else:
	st.warning("⚠️ Model Not Loaded")

	if client and st.session_state.db_created:
	count = get_vector_count_reliable(client, COLLECTION_NAME)
	st.metric("Vectors in DB", f"{count:,}")

	show_all = st.session_state.show_step == 'all'
	show_upload = st.session_state.show_step in ['all', 'upload']
	show_search = st.session_state.show_step in ['all', 'search']

	# ============================================================================
	# STEP 1-2: Quick Status
	# ============================================================================

	if show_all:
	st.header("Step 1-2: System Check")

	col1, col2, col3 = st.columns(3)

	with col1:
	st.metric("Claude API", "✅" if os.getenv("ANTHROPIC_API_KEY") else "❌")

	with col2:
	st.metric("Qdrant", "✅ Connected" if client else "❌")

	with col3:
	st.metric("Embedder", "✅ Cached" if embedder else "❌")

	if not client:
	st.error("⚠️ Check Qdrant secrets!")
	st.stop()

	st.markdown("---")

	# ============================================================================
	# STEP 3: Collection Management
	# ============================================================================

	if show_all:
	st.header("🏗️ Step 3: Database Collection")

	if st.session_state.db_created:
	st.success(f"✅ Collection '{COLLECTION_NAME}' ready!")

	col1, col2 = st.columns(2)
	with col1:
	if st.button("🔄 Recreate Collection"):
	try:
	client.delete_collection(COLLECTION_NAME)
	st.session_state.db_created = False
	st.rerun()
	except Exception as e:
	st.error(f"Error: {e}")

	with col2:
	if st.button("ℹ️ Collection Info"):
	count = get_vector_count_reliable(client, COLLECTION_NAME)
	st.json({"name": COLLECTION_NAME, "vectors": count, "status": "Ready"})

	else:
	if st.button("🏗️ CREATE COLLECTION", type="primary"):
	try:
	client.create_collection(
	collection_name=COLLECTION_NAME,
	vectors_config=VectorParams(size=384, distance=Distance.COSINE)
	)
	st.success(f"🎉 Created: {COLLECTION_NAME}")
	st.session_state.db_created = True
	st.rerun()
	except Exception as e:
	st.error(f"❌ Failed: {str(e)}")

	st.markdown("---")

	# ============================================================================
	# STEP 4: Embedding Model
	# ============================================================================

	if show_all:
	st.header("🤖 Step 4: Embedding Model")

	if embedder:
	st.success("✅ Model loaded and cached!")
	st.session_state.embedder_ready = True
	else:
	st.warning("⚠️ Model loading failed. Refresh page.")

	st.markdown("---")

	# ============================================================================
	# STEP 5A: Upload Custom Text
	# ============================================================================

	if show_upload:
	st.header("📝 Step 5A: Upload Custom Notes")

	if not st.session_state.db_created or not st.session_state.embedder_ready:
	st.error("⚠️ Complete Steps 3 & 4 first")
	else:

	# Choose upload method
	upload_method = st.radio(
	"Upload method:",
	["📝 Paste Text", "📄 Upload PDF File"],
	horizontal=True
	)

	if upload_method == "📝 Paste Text":
	with st.expander("✍️ Paste text", expanded=True):

	custom_text = st.text_area(
	"Math notes:",
	value="""Linear Equations: ax + b = 0, solution is x = -b/a

	Quadratic Equations: ax² + bx + c = 0
	Solution: x = (-b ± √(b²-4ac)) / 2a

	Pythagorean Theorem: a² + b² = c²

	Derivatives:
	d/dx(xⁿ) = nxⁿ⁻¹
	d/dx(sin x) = cos x""",
	height=200
	)

	source_name = st.text_input("Source name:", value="math_notes.txt")

	if st.button("🚀 UPLOAD TEXT", type="primary"):

	if not custom_text.strip():
	st.error("Please enter text!")
	else:
	try:
	progress = st.progress(0)
	status = st.empty()

	status.text("📄 Chunking text...")
	progress.progress(0.2)

	words = custom_text.split()
	chunks = []
	chunk_size = 50

	for i in range(0, len(words), 40):
	chunk = ' '.join(words[i:i + chunk_size])
	if chunk.strip():
	chunks.append(chunk)

	st.write(f"✅ Created {len(chunks)} chunks")

	status.text("🔢 Generating embeddings...")
	progress.progress(0.5)

	embeddings = embedder.encode(chunks, show_progress_bar=False)
	st.write(f"✅ Generated {len(embeddings)} embeddings")

	status.text("☁️ Uploading...")
	progress.progress(0.8)

	points = []
	for idx, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
	points.append(PointStruct(
	id=abs(hash(f"{source_name}_{idx}_{custom_text[:50]}_{time.time()}")) % (2**63),
	vector=embedding.tolist(),
	payload={
	"content": chunk,
	"source_name": source_name,
	"source_type": "custom_notes",
	"chunk_index": idx
	}
	))

	client.upsert(collection_name=COLLECTION_NAME, points=points)

	progress.progress(1.0)
	status.empty()

	st.success(f"🎉 Uploaded {len(points)} vectors!")

	count = get_vector_count_reliable(client, COLLECTION_NAME)
	st.info(f"📊 Total vectors: {count:,}")

	except Exception as e:
	st.error(f"❌ Failed: {str(e)}")
	st.exception(e)

	else: # PDF Upload
	with st.expander("📄 Upload PDF", expanded=True):

	st.info("🎉 NEW! Upload your math PDFs directly")

	uploaded_file = st.file_uploader(
	"Choose PDF file:",
	type=['pdf'],
	help="Upload a PDF with math content"
	)

	if uploaded_file:
	st.write(f"📄 File: {uploaded_file.name} ({uploaded_file.size / 1024:.1f} KB)")

	source_name = st.text_input(
	"Source name:",
	value=uploaded_file.name.replace('.pdf', '')
	)

	if st.button("🚀 UPLOAD PDF", type="primary"):

	try:
	progress = st.progress(0)
	status = st.empty()

	# Extract text
	status.text("📖 Extracting text from PDF...")
	progress.progress(0.1)

	extracted_text = extract_text_from_pdf(uploaded_file)

	if not extracted_text:
	st.error("❌ Failed to extract text from PDF")
	st.stop()

	st.write(f"✅ Extracted {len(extracted_text)} characters")

	# Show preview
	with st.expander("👁️ Preview extracted text"):
	st.text(extracted_text[:500] + "..." if len(extracted_text) > 500 else extracted_text)

	# Chunk
	status.text("📄 Chunking text...")
	progress.progress(0.3)

	words = extracted_text.split()
	chunks = []
	chunk_size = 100 # Larger chunks for PDFs
	overlap = 20

	for i in range(0, len(words), chunk_size - overlap):
	chunk = ' '.join(words[i:i + chunk_size])
	if chunk.strip():
	chunks.append(chunk)

	st.write(f"✅ Created {len(chunks)} chunks")

	# Embed
	status.text("🔢 Generating embeddings...")
	progress.progress(0.5)

	embeddings = []
	for idx, chunk in enumerate(chunks):
	embedding = embedder.encode(chunk)
	embeddings.append(embedding)

	if idx % 20 == 0:
	progress.progress(0.5 + (0.3 * idx / len(chunks)))

	st.write(f"✅ Generated {len(embeddings)} embeddings")

	# Upload
	status.text("☁️ Uploading to database...")
	progress.progress(0.9)

	points = []
	for idx, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
	points.append(PointStruct(
	id=abs(hash(f"pdf_{source_name}_{idx}_{time.time()}")) % (2**63),
	vector=embedding.tolist(),
	payload={
	"content": chunk,
	"source_name": source_name,
	"source_type": "pdf_upload",
	"chunk_index": idx,
	"file_name": uploaded_file.name
	}
	))

	client.upsert(collection_name=COLLECTION_NAME, points=points)

	progress.progress(1.0)
	status.empty()

	st.success(f"🎉 Uploaded {len(points)} vectors from PDF!")
	st.balloons()

	count = get_vector_count_reliable(client, COLLECTION_NAME)
	st.info(f"📊 Total vectors: {count:,}")

	except Exception as e:
	st.error(f"❌ Upload failed: {str(e)}")
	st.exception(e)

	st.markdown("---")

	# ============================================================================
	# STEP 5B: Load Public Datasets (FIXED - No DeepMind)
	# ============================================================================

	if show_upload:
	st.header("📚 Step 5B: Load Public Datasets")

	if not st.session_state.db_created or not st.session_state.embedder_ready:
	st.error("⚠️ Complete Steps 3 & 4 first")
	else:
	with st.expander("📊 Load from Hugging Face", expanded=False):

	dataset_choice = st.selectbox(
	"Dataset:",
	[
	"GSM8K - Grade School Math (8.5K problems)",
	"MATH - Competition Math (12.5K problems) ✨",
	"MathQA - Math Word Problems (37K problems) 🆕",
	"CAMEL-AI Math - GPT-4 Generated (50K problems)",
	"RACE - Reading Comprehension (28K passages)"
	]
	)

	# INCREASED LIMIT FROM 500 TO 2000!
	sample_size = st.slider("Items to load:", 10, 2000, 50)

	st.warning(f"⚠️ Loading {sample_size} items. Large numbers take 5-15 minutes!")

	if st.button("📥 LOAD DATASET", type="primary"):

	try:
	from datasets import load_dataset

	progress = st.progress(0)
	status = st.empty()

	# GSM8K
	if "GSM8K" in dataset_choice:
	status.text("📥 Downloading GSM8K...")
	progress.progress(0.1)

	dataset = load_dataset("openai/gsm8k", "main", split="train", trust_remote_code=True)
	dataset_name = "GSM8K"

	texts = []
	for i in range(min(sample_size, len(dataset))):
	item = dataset[i]
	text = f"Problem: {item['question']}\n\nSolution: {item['answer']}"
	texts.append(text)

	# MATH
	elif "MATH" in dataset_choice and "Competition" in dataset_choice:
	status.text("📥 Downloading MATH...")
	progress.progress(0.1)

	dataset = None
	dataset_name = "MATH"

	# Try multiple sources
	for source in ["lighteval/MATH", "DigitalLearningGmbH/MATH-lighteval", "EleutherAI/hendrycks_math"]:
	try:
	dataset = load_dataset(source, split="train", trust_remote_code=True)
	st.success(f"✅ Using {source}")
	break
	except:
	continue

	if dataset is None:
	st.error("❌ All MATH sources failed")
	st.stop()

	texts = []
	for i in range(min(sample_size, len(dataset))):
	item = dataset[i]
	problem = item.get('problem', item.get('question', ''))
	solution = item.get('solution', item.get('answer', ''))
	problem_type = item.get('type', item.get('level', 'general'))
	text = f"Problem ({problem_type}): {problem}\n\nSolution: {solution}"
	texts.append(text)

	# MathQA (REPLACES DEEPMIND)
	elif "MathQA" in dataset_choice:
	status.text("📥 Downloading MathQA...")
	progress.progress(0.1)

	st.info("🆕 MathQA: 37K math word problems with detailed solutions")

	dataset = load_dataset("allenai/math_qa", split="train", trust_remote_code=True)
	dataset_name = "MathQA"

	texts = []
	for i in range(min(sample_size, len(dataset))):
	item = dataset[i]
	text = f"Problem: {item['Problem']}\n\nRationale: {item['Rationale']}\n\nAnswer: {item['correct']}"
	texts.append(text)

	# CAMEL-AI
	elif "CAMEL" in dataset_choice:
	status.text("📥 Downloading CAMEL-AI...")
	progress.progress(0.1)

	dataset = load_dataset("camel-ai/math", split="train", trust_remote_code=True)
	dataset_name = "CAMEL-Math"

	texts = []
	for i in range(min(sample_size, len(dataset))):
	item = dataset[i]
	text = f"Problem: {item['message']}"
	texts.append(text)

	# RACE
	else:
	status.text("📥 Downloading RACE...")
	progress.progress(0.1)

	dataset = load_dataset("ehovy/race", "all", split="train", trust_remote_code=True)
	dataset_name = "RACE"

	texts = []
	for i in range(min(sample_size, len(dataset))):
	item = dataset[i]
	text = f"Article: {item['article'][:500]}\n\nQuestion: {item['question']}\n\nAnswer: {item['answer']}"
	texts.append(text)

	# Common processing
	st.write(f"✅ Loaded {len(texts)} items from {dataset_name}")
	progress.progress(0.3)

	status.text("🔢 Generating embeddings...")
	embeddings = []

	for idx, text in enumerate(texts):
	embedding = embedder.encode(text)
	embeddings.append(embedding)

	if idx % 50 == 0:
	progress.progress(0.3 + (0.5 * idx / len(texts)))
	status.text(f"🔢 Embedding {idx+1}/{len(texts)}")

	st.write(f"✅ Generated {len(embeddings)} embeddings")
	progress.progress(0.8)

	status.text("☁️ Uploading...")

	points = []
	for idx, (text, embedding) in enumerate(zip(texts, embeddings)):
	content = text[:2000] if len(text) > 2000 else text

	points.append(PointStruct(
	id=abs(hash(f"{dataset_name}_{idx}_{time.time()}")) % (2**63),
	vector=embedding.tolist(),
	payload={
	"content": content,
	"source_name": dataset_name,
	"source_type": "public_dataset",
	"dataset": dataset_name,
	"index": idx
	}
	))

	client.upsert(collection_name=COLLECTION_NAME, points=points)

	progress.progress(1.0)
	status.empty()

	st.success(f"🎉 Uploaded {len(points)} vectors from {dataset_name}!")

	count = get_vector_count_reliable(client, COLLECTION_NAME)
	st.info(f"📊 Total vectors: {count:,}")

	except ImportError:
	st.error("❌ Add 'datasets' to requirements.txt")
	except Exception as e:
	st.error(f"❌ Failed: {str(e)}")
	st.exception(e)

	st.markdown("---")

	# ============================================================================
	# STEP 6: Search
	# ============================================================================

	if show_search:
	st.header("🔍 Step 6: Test Search")

	if not st.session_state.db_created or not st.session_state.embedder_ready:
	st.error("⚠️ Database and embedder must be ready")
	else:
	search_query = st.text_input(
	"Question:",
	placeholder="Solve x² + 5x - 4 = 0"
	)

	col1, col2 = st.columns([3, 1])
	with col1:
	top_k = st.slider("Results:", 1, 10, 5)

	with col2:
	st.metric("DB Vectors", get_vector_count_reliable(client, COLLECTION_NAME))

	if st.button("🔍 SEARCH", type="primary") and search_query:

	try:
	with st.spinner("Searching..."):

	query_embedding = embedder.encode(search_query)

	results = client.search(
	collection_name=COLLECTION_NAME,
	query_vector=query_embedding.tolist(),
	limit=top_k
	)

	if results:
	st.success(f"✅ Found {len(results)} results!")

	for i, result in enumerate(results, 1):
	similarity_pct = result.score * 100

	if similarity_pct > 50:
	color = "🟢"
	elif similarity_pct > 30:
	color = "🟡"
	else:
	color = "🔴"

	with st.expander(f"{color} Result {i} - {similarity_pct:.1f}% match", expanded=(i<=2)):
	st.info(result.payload['content'])

	col1, col2, col3 = st.columns(3)
	with col1:
	st.caption(f"Source: {result.payload['source_name']}")
	with col2:
	st.caption(f"Type: {result.payload['source_type']}")
	with col3:
	st.caption(f"Score: {result.score:.4f}")
	else:
	st.warning("No results found!")

	except Exception as e:
	st.error(f"❌ Search failed: {str(e)}")

	st.markdown("---")
	st.success("🎉 Phase 2.5 Complete! You now have: Text, PDF upload, and 4 working datasets!")