Spaces:

NeerajCodz
/

aiMathQuestionClassification

Sleeping

App Files Files Community

aiMathQuestionClassification / app.py

NeerajCodz

Fresh start: Push all project files including models and notebooks

1d5f27f 24 days ago

raw

history blame

22.4 kB

	import warnings
	warnings.filterwarnings('ignore', category=FutureWarning)
	warnings.filterwarnings('ignore', category=UserWarning)
	import gradio as gr
	import pickle
	import numpy as np
	import re
	import os
	from google import genai
	from pathlib import Path
	from typing import Dict, Tuple
	from nltk.corpus import stopwords
	from nltk.stem import WordNetLemmatizer
	from scipy.sparse import hstack
	from dotenv import load_dotenv

	# Load environment variables
	load_dotenv()

	# Configure Gemini
	GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
	if GEMINI_API_KEY:
	client = genai.Client(api_key=GEMINI_API_KEY)
	model_name = 'gemini-1.5-flash'
	else:
	client = None
	model_name = None
	print("WARNING: GEMINI_API_KEY not found in environment variables")

	# Download NLTK data if not present
	import nltk
	try:
	nltk.data.find('corpora/stopwords')
	except LookupError:
	nltk.download('stopwords', quiet=True)
	try:
	nltk.data.find('corpora/wordnet')
	except LookupError:
	nltk.download('wordnet', quiet=True)


	class MathFeatureExtractor:
	"""Extract features from math problems"""

	def __init__(self):
	self.lemmatizer = WordNetLemmatizer()
	self.stop_words = set(stopwords.words('english'))

	def clean_latex(self, text: str) -> str:
	"""Remove or simplify LaTeX commands"""
	text = re.sub(r'\\[a-zA-Z]+\{([^}]*)\}', r'\1', text)
	text = re.sub(r'\\[a-zA-Z]+', ' ', text)
	text = re.sub(r'[\{\}\$\\]', ' ', text)
	return text

	def extract_math_symbols(self, text: str) -> Dict[str, int]:
	"""Extract mathematical symbols as binary features"""
	symbols = {
	'has_fraction': int('frac' in text or '/' in text),
	'has_sqrt': int('sqrt' in text or '√' in text),
	'has_exponent': int('^' in text or 'pow' in text),
	'has_integral': int('int' in text or '∫' in text),
	'has_derivative': int("'" in text or 'prime' in text),
	'has_summation': int('sum' in text or '∑' in text),
	'has_pi': int('pi' in text or 'π' in text),
	'has_trigonometric': int(any(t in text.lower() for t in ['sin', 'cos', 'tan'])),
	'has_inequality': int(any(s in text for s in ['<', '>', 'leq', 'geq', '≤', '≥'])),
	'has_absolute': int('abs' in text or '\|' in text),
	}
	return symbols

	def extract_numeric_features(self, text: str) -> Dict[str, float]:
	"""Extract numeric features from text"""
	numbers = re.findall(r'-?\d+\.?\d*', text)
	return {
	'num_count': len(numbers),
	'has_large_numbers': int(any(float(n) > 100 for n in numbers if n)),
	'has_decimals': int(any('.' in n for n in numbers)),
	'has_negatives': int(any(n.startswith('-') for n in numbers)),
	'avg_number': np.mean([float(n) for n in numbers]) if numbers else 0,
	}

	def preprocess_text(self, text: str) -> str:
	"""Clean and preprocess text"""
	text = self.clean_latex(text)
	text = text.lower()
	text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
	words = text.split()
	words = [self.lemmatizer.lemmatize(w) for w in words
	if w not in self.stop_words and len(w) > 2]
	return ' '.join(words)


	# Load the trained model
	def load_model(model_path: str = "model.pkl"):
	"""Load the trained model and components"""
	with open(model_path, 'rb') as f:
	model_data = pickle.load(f)
	return model_data


	# Initialize
	feature_extractor = MathFeatureExtractor()
	model_data = load_model()
	model = model_data['model']
	vectorizer = model_data['vectorizer']
	scaler = model_data['scaler']
	label_encoder = model_data['label_encoder']


	def extract_features(question: str) -> np.ndarray:
	"""Extract features from a question"""
	# Preprocess text
	processed_text = feature_extractor.preprocess_text(question)

	# Extract mathematical and numeric features
	math_symbols = feature_extractor.extract_math_symbols(question)
	numeric_features = feature_extractor.extract_numeric_features(question)

	# Combine additional features
	additional_features = np.array(list(math_symbols.values()) + list(numeric_features.values())).reshape(1, -1)

	# Vectorize text
	X_text = vectorizer.transform([processed_text])

	# Scale additional features
	X_additional_scaled = scaler.transform(additional_features)

	# Combine all features
	X = hstack([X_text, X_additional_scaled])

	return X


	def get_gemini_solution(question: str, image_path: str = None) -> str:
	"""Get solution from Gemini API"""
	if not client or not model_name:
	return "Gemini API key not configured. Please set GEMINI_API_KEY in your .env file."

	try:
	if image_path:
	# Load and process image
	from PIL import Image
	img = Image.open(image_path)
	prompt = "Solve this math problem step-by-step with clear explanations."

	response = client.models.generate_content(
	model=model_name,
	contents=[prompt, img]
	)
	else:
	prompt = f"Solve this math problem step-by-step: {question}"

	response = client.models.generate_content(
	model=model_name,
	contents=prompt
	)

	return response.text
	except Exception as e:
	error_msg = str(e).lower()
	if '429' in error_msg or 'quota' in error_msg or 'rate limit' in error_msg:
	return "ERROR: Gemini API rate limit exceeded. Please try again later."
	elif '404' in error_msg or 'not found' in error_msg:
	return "ERROR: Gemini API model not available."
	else:
	return "ERROR: Unable to get solution from Gemini API."


	def predict_and_solve(question: str, image) -> Tuple[str, str]:
	"""Predict topic and get solution"""
	if not question.strip() and image is None:
	return "Please enter a math question or upload an image.", ""

	# If image is provided, use OCR or direct analysis
	image_path = None
	if image is not None:
	image_path = image
	# For image input, we'll let Gemini handle the text extraction
	# Skip classification for now and go straight to solution
	solution = get_gemini_solution("", image_path)

	solution_html = "<div style='font-family: Arial, sans-serif; line-height: 1.8;'>"
	solution_html += "<h2 style='color: #2c3e50; margin: 20px 0;'>AI Solution from Image</h2>"
	solution_html += "<div style='background-color: #f8f9fa; padding: 20px; border-radius: 10px; border-left: 4px solid #3498db;'>"
	solution_html += solution.replace('\n', '<br>')
	solution_html += "</div></div>"

	return "<div style='font-family: Arial, sans-serif; background-color: #1a1a1a; padding: 25px; border-radius: 12px;'><h2 style='color: #ffffff;'>Image Analysis</h2><p style='color: #ffffff;'>Processing image input...</p></div>", solution_html

	# Extract features and predict
	X = extract_features(question)

	# Get probabilities
	if hasattr(model, 'predict_proba'):
	probabilities = model.predict_proba(X)[0]

	# Sort by probability
	sorted_indices = np.argsort(probabilities)[::-1]

	# Create probability display
	prob_html = "<div style='font-family: Arial, sans-serif; background-color: #1a1a1a; padding: 25px; border-radius: 12px;'>"
	prob_html += "<h2 style='color: #ffffff; margin-bottom: 20px;'>Topic Classification</h2>"

	for idx in sorted_indices:
	topic = label_encoder.classes_[idx]
	prob = probabilities[idx] * 100

	if prob < 1: # Skip very low probabilities
	continue

	# Color based on probability
	if prob >= 50:
	color = "#27ae60" # Green
	elif prob >= 30:
	color = "#f39c12" # Orange
	else:
	color = "#95a5a6" # Gray

	prob_html += f"""
	<div style='margin: 15px 0;'>
	<div style='display: flex; justify-content: space-between; margin-bottom: 5px;'>
	<span style='font-weight: bold; color: #ffffff; text-transform: capitalize;'>{topic}</span>
	<span style='font-weight: bold; color: {color};'>{prob:.1f}%</span>
	</div>
	<div style='background-color: #2d2d2d; border-radius: 10px; height: 25px; overflow: hidden;'>
	<div style='background-color: {color}; height: 100%; width: {prob}%; transition: width 0.3s ease;'></div>
	</div>
	</div>
	"""

	prob_html += "</div>"
	else:
	prediction = model.predict(X)[0]
	topic = label_encoder.inverse_transform([prediction])[0]
	prob_html = f"<h2>Predicted Topic: {topic}</h2>"

	# Get solution from Gemini
	solution = get_gemini_solution(question)

	# Format solution with proper HTML
	solution_html = "<div style='font-family: Arial, sans-serif; line-height: 1.8;'>"
	solution_html += "<h2 style='color: #ffffff; margin: 20px 0;'>AI Solution</h2>"
	solution_html += "<div style='background-color: #1a1a1a; color: #ffffff; padding: 20px; border-radius: 10px; border-left: 4px solid #3498db;'>"
	solution_html += solution.replace('\n', '<br>')
	solution_html += "</div></div>"

	return prob_html, solution_html


	def create_docs_content():
	"""Create documentation content"""
	docs_html = """
	<div style='font-family: Arial, sans-serif; max-width: 1200px; margin: 0 auto; padding: 20px;'>
	<h1 style='color: #ffffff; border-bottom: 3px solid #ffffff; padding-bottom: 10px;'>📚 AI Math Question Classification - Documentation</h1>

	<h2 style='color: #3498db; margin-top: 30px;'>🎯 Project Overview</h2>
	<p style='line-height: 1.8; color: #555;'>
	This project implements an intelligent mathematical question classification system that automatically categorizes
	math problems into their respective topics (Algebra, Calculus, Geometry, etc.) using machine learning techniques.
	</p>

	<h2 style='color: #3498db; margin-top: 30px;'>📊 Dataset</h2>
	<ul style='line-height: 2; color: #555;'>
	<li><strong>Source:</strong> MATH Dataset - A collection of mathematical competition problems</li>
	<li><strong>Training Samples:</strong> 7,500 problems</li>
	<li><strong>Test Samples:</strong> 5,000 problems</li>
	<li><strong>Topics:</strong> 7 categories (Algebra, Calculus, Geometry, Number Theory, Precalculus, Probability, Intermediate Algebra)</li>
	<li><strong>Format:</strong> JSON files converted to Parquet for efficient processing</li>
	</ul>

	<h2 style='color: #3498db; margin-top: 30px;'>🔧 Methodology</h2>

	<h3 style='color: #3498db; margin-top: 20px;'>1. Feature Engineering</h3>
	<div style='background-color: #1a1a1a; color: #ffffff; padding: 15px; border-radius: 5px; margin: 10px 0;'>
	<h4 style='color: #3498db;'>Text Features (TF-IDF)</h4>
	<ul style='line-height: 1.8;'>
	<li>Max Features: 5,000</li>
	<li>N-gram Range: (1, 3) - captures single words, bigrams, and trigrams</li>
	<li>Min Document Frequency: 2 - removes very rare terms</li>
	<li>Max Document Frequency: 0.95 - removes overly common terms</li>
	<li>Sublinear TF: True - applies log scaling to term frequency</li>
	</ul>
	</div>

	<div style='background-color: #1a1a1a; color: #3498db; padding: 15px; border-radius: 5px; margin: 10px 0;'>
	<h4 style='color: #3498db;'>Mathematical Symbol Features</h4>
	<ul style='line-height: 1.8;'>
	<li>Fractions: Presence of division operations</li>
	<li>Square roots: √ or sqrt notation</li>
	<li>Exponents: Powers and exponential functions</li>
	<li>Integrals: ∫ or integration notation</li>
	<li>Derivatives: Prime notation or derivative symbols</li>
	<li>Summations: ∑ or sum notation</li>
	<li>Trigonometric: sin, cos, tan functions</li>
	<li>Inequalities: <, >, ≤, ≥ symbols</li>
	<li>Absolute values: \| \| notation</li>
	<li>Pi (π) presence</li>
	</ul>
	</div>

	<div style='background-color: #1a1a1a; color: #3498db; padding: 15px; border-radius: 5px; margin: 10px 0;'>
	<h4 style='color: #3498db;'>Numeric Features</h4>
	<ul style='line-height: 1.8;'>
	<li>Number count in the problem</li>
	<li>Presence of large numbers (> 100)</li>
	<li>Presence of decimal numbers</li>
	<li>Presence of negative numbers</li>
	<li>Average value of numbers in the problem</li>
	</ul>
	</div>

	<h3 style='color: #3498db; margin-top: 20px;'>2. Text Preprocessing</h3>
	<ol style='line-height: 2; color: #555;'>
	<li><strong>LaTeX Cleaning:</strong> Remove or simplify LaTeX commands while preserving meaning</li>
	<li><strong>Lowercasing:</strong> Convert all text to lowercase for uniformity</li>
	<li><strong>Special Character Removal:</strong> Remove non-alphanumeric characters (except those in formulas)</li>
	<li><strong>Stop Word Removal:</strong> Remove common English words that don't add value</li>
	<li><strong>Lemmatization:</strong> Reduce words to their base form (e.g., "running" → "run")</li>
	</ol>

	<h3 style='color: #3498db; margin-top: 20px;'>3. Models Evaluated</h3>
	<div style='background-color: #1a1a1a; color: #ffffff; padding: 15px; border-radius: 5px; margin: 10px 0;'>
	<table style='width: 100%; border-collapse: collapse;'>
	<tr style='background-color: #16a085; color: white;'>
	<th style='padding: 10px; text-align: left;'>Model</th>
	<th style='padding: 10px; text-align: left;'>Description</th>
	<th style='padding: 10px; text-align: left;'>Key Parameters</th>
	</tr>
	<tr style='background-color: #2d2d2d;'>
	<td style='padding: 10px; border: 1px solid #444;'><strong>Naive Bayes</strong></td>
	<td style='padding: 10px; border: 1px solid #444;'>Probabilistic classifier based on Bayes' theorem</td>
	<td style='padding: 10px; border: 1px solid #444;'>alpha=0.1</td>
	</tr>
	<tr style='background-color: #1a1a1a;'>
	<td style='padding: 10px; border: 1px solid #444;'><strong>Logistic Regression</strong></td>
	<td style='padding: 10px; border: 1px solid #444;'>Linear model with logistic function</td>
	<td style='padding: 10px; border: 1px solid #444;'>C=1.0, solver='saga', max_iter=1000</td>
	</tr>
	<tr style='background-color: #2d2d2d;'>
	<td style='padding: 10px; border: 1px solid #444;'><strong>SVM</strong></td>
	<td style='padding: 10px; border: 1px solid #444;'>Support Vector Machine with linear kernel</td>
	<td style='padding: 10px; border: 1px solid #444;'>kernel='linear', C=1.0</td>
	</tr>
	<tr style='background-color: #1a1a1a;'>
	<td style='padding: 10px; border: 1px solid #444;'><strong>Random Forest</strong></td>
	<td style='padding: 10px; border: 1px solid #444;'>Ensemble of decision trees</td>
	<td style='padding: 10px; border: 1px solid #444;'>n_estimators=200, max_depth=30</td>
	</tr>
	<tr style='background-color: #2d2d2d;'>
	<td style='padding: 10px; border: 1px solid #444;'><strong>Gradient Boosting</strong></td>
	<td style='padding: 10px; border: 1px solid #444;'>Sequential ensemble method</td>
	<td style='padding: 10px; border: 1px solid #444;'>n_estimators=100, learning_rate=0.1</td>
	</tr>
	</table>
	</div>

	<h2 style='color: #3498db; margin-top: 30px;'>Results & Performance</h2>
	<div style='background-color: #1a1a1a; color: #ffffff; padding: 20px; border-radius: 10px; border-left: 5px solid #ffc107; margin: 20px 0;'>
	<h3 style='color: #ffc107;'>🏆 Best Model: Random Forest / Gradient Boosting</h3>
	<ul style='line-height: 2;'>
	<li><strong>Test Accuracy:</strong> ~85-90%</li>
	<li><strong>F1-Score (Weighted):</strong> ~0.85-0.90</li>
	<li><strong>Training Time:</strong> ~30-60 seconds</li>
	</ul>
	</div>

	<h3 style='color: #3498db; margin-top: 20px;'>Per-Topic Performance Insights</h3>
	<ul style='line-height: 2; color: #555;'>
	<li><strong>Strongest Topics:</strong> Algebra, Number Theory (clear mathematical patterns)</li>
	<li><strong>Challenging Topics:</strong> Precalculus, Intermediate Algebra (overlapping concepts)</li>
	<li><strong>Common Confusions:</strong> Calculus ↔ Precalculus, Algebra ↔ Intermediate Algebra</li>
	</ul>

	<h2 style='color: #3498db; margin-top: 30px;'>Technical Stack</h2>
	<ul style='line-height: 2; color: #555;'>
	<li><strong>Machine Learning:</strong> scikit-learn</li>
	<li><strong>NLP:</strong> NLTK, TF-IDF Vectorization</li>
	<li><strong>Feature Engineering:</strong> Custom mathematical feature extractors</li>
	<li><strong>Interface:</strong> Gradio</li>
	<li><strong>AI Integration:</strong> Google Gemini API</li>
	<li><strong>Data Processing:</strong> Pandas, NumPy</li>
	<li><strong>Deployment:</strong> Docker, HuggingFace Spaces</li>
	</ul>

	<h2 style='color: #3498db; margin-top: 30px;'>Insights</h2>
	<ol style='line-height: 2; color: #555;'>
	<li><strong>Domain-Specific Features Matter:</strong> Mathematical symbol detection significantly improved classification accuracy</li>
	<li><strong>Text Preprocessing is Critical:</strong> Proper LaTeX handling prevented information loss</li>
	<li><strong>Ensemble Methods Excel:</strong> Random Forest and Gradient Boosting outperformed simpler models</li>
	<li><strong>Class Imbalance:</strong> Using class weights helped balance performance across topics</li>
	<li><strong>Feature Scaling:</strong> Normalizing numeric features improved model stability</li>
	</ol>

	<div style='background-color: #1a1a1a; color: #ffffff; padding: 20px; border-radius: 10px; margin-top: 30px; border-left: 5px solid #28a745;'>
	<h3 style='color: #28a745;'>✅ Conclusion</h3>
	<p style='line-height: 1.8;'>
	This project successfully demonstrates the application of machine learning and NLP techniques
	to mathematical problem classification. By combining traditional feature engineering with modern
	AI capabilities, we've created a practical tool that can help students and educators quickly
	categorize and solve mathematical problems.
	</p>
	</div>
	</div>
	"""
	return docs_html


	# Create Gradio interface
	def create_interface():
	"""Create the Gradio interface"""

	with gr.Blocks(title="AI Math Question Classifier") as demo:
	gr.Markdown("""
	# AI Math Question Classifier & Solver
	### Classify math questions by topic and get AI-powered solutions
	""")

	with gr.Tabs() as tabs:
	# Home Tab
	with gr.Tab("Home"):
	with gr.Row():
	with gr.Column(scale=1):
	gr.Markdown("### Enter Your Math Question")
	question_input = gr.Textbox(
	label="Math Question",
	placeholder="Example: Find the derivative of f(x) = x^2 + 3x + 2",
	lines=6,
	max_lines=12
	)

	gr.Markdown("### Or Upload an Image")
	image_input = gr.Image(
	label="Math Problem Image",
	type="filepath",
	sources=["upload", "clipboard"]
	)

	submit_btn = gr.Button("Classify & Solve", variant="primary", size="lg")

	with gr.Column(scale=1):
	gr.Markdown("### Results")
	classification_output = gr.HTML(label="Topic Classification")

	gr.Markdown("---")

	solution_output = gr.HTML(label="AI Solution")

	submit_btn.click(
	fn=predict_and_solve,
	inputs=[question_input, image_input],
	outputs=[classification_output, solution_output]
	)

	# Docs Tab
	with gr.Tab("Documentation"):
	gr.HTML(create_docs_content())

	gr.Markdown("""
	---
	<div style='text-align: center; color: #666;'>
	<p>Built using Gradio, scikit-learn, and Google Gemini</p>
	<p>Deployed on HuggingFace Spaces \| Docker-ready</p>
	</div>
	""")

	return demo


	if __name__ == "__main__":
	demo = create_interface()
	demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)