# app.py — Hugging Face Space "PDF to Flashcards" (commit d818498)
# import gradio as gr
# import PyPDF2
# import re
# import json
# from typing import List, Dict
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# import torch
# import tempfile
# import os
# # Initialize the model and tokenizer directly
# print("Loading models... This may take a minute on first run.")
# model_name = "valhalla/t5-small-qg-hl"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# # Set to evaluation mode and CPU
# model.eval()
# device = torch.device("cpu")
# model.to(device)
# def generate_questions(context: str, answer: str, max_length: int = 128) -> str:
# """Generate a question using T5 model."""
# try:
# # Format: "generate question: <hl> answer <hl> context"
# input_text = f"generate question: <hl> {answer} <hl> {context}"
# # Tokenize
# inputs = tokenizer(
# input_text,
# return_tensors="pt",
# max_length=512,
# truncation=True,
# padding=True
# ).to(device)
# # Generate
# with torch.no_grad():
# outputs = model.generate(
# **inputs,
# max_length=max_length,
# num_beams=4,
# early_stopping=True,
# do_sample=True,
# temperature=0.7
# )
# # Decode
# question = tokenizer.decode(outputs[0], skip_special_tokens=True)
# # Clean up
# question = re.sub(r'^(question:|q:)', '', question, flags=re.IGNORECASE).strip()
# return question if len(question) > 10 else ""
# except Exception as e:
# print(f"Error generating question: {e}")
# return ""
# def extract_text_from_pdf(pdf_file) -> str:
# """Extract text from uploaded PDF file."""
# text = ""
# try:
# if isinstance(pdf_file, str):
# pdf_reader = PyPDF2.PdfReader(pdf_file)
# else:
# pdf_reader = PyPDF2.PdfReader(pdf_file)
# for page in pdf_reader.pages:
# page_text = page.extract_text()
# if page_text:
# text += page_text + "\n"
# except Exception as e:
# return f"Error reading PDF: {str(e)}"
# return text
# def clean_text(text: str) -> str:
# """Clean and preprocess extracted text."""
# # Remove excessive whitespace
# text = re.sub(r'\s+', ' ', text)
# # Remove special characters but keep sentence structure
# text = re.sub(r'[^\w\s.,;!?-]', '', text)
# return text.strip()
# def chunk_text(text: str, max_chunk_size: int = 512, overlap: int = 50) -> List[str]:
# """Split text into overlapping chunks for processing."""
# sentences = re.split(r'(?<=[.!?])\s+', text)
# chunks = []
# current_chunk = ""
# for sentence in sentences:
# if len(current_chunk) + len(sentence) < max_chunk_size:
# current_chunk += " " + sentence
# else:
# if current_chunk:
# chunks.append(current_chunk.strip())
# current_chunk = sentence
# if current_chunk:
# chunks.append(current_chunk.strip())
# # Add overlap between chunks for context
# overlapped_chunks = []
# for i, chunk in enumerate(chunks):
# if i > 0 and overlap > 0:
# prev_sentences = chunks[i-1].split('. ')
# overlap_text = '. '.join(prev_sentences[-2:]) if len(prev_sentences) > 1 else chunks[i-1][-overlap:]
# chunk = overlap_text + " " + chunk
# overlapped_chunks.append(chunk)
# return overlapped_chunks
# def generate_qa_pairs(chunk: str, num_questions: int = 2) -> List[Dict[str, str]]:
# """Generate question-answer pairs from a text chunk."""
# flashcards = []
# # Skip chunks that are too short
# words = chunk.split()
# if len(words) < 20:
# return []
# try:
# # Split into sentences to use as answers
# sentences = [s.strip() for s in chunk.split('. ') if len(s.strip()) > 20]
# if len(sentences) < 1:
# return []
# # Generate questions for different sentences
# for i in range(min(num_questions, len(sentences))):
# answer = sentences[i]
# # Skip very short answers
# if len(answer.split()) < 3:
# continue
# question = generate_questions(chunk, answer)
# if question and question != answer: # Make sure they're different
# flashcards.append({
# "question": question,
# "answer": answer,
# "context": chunk[:200] + "..." if len(chunk) > 200 else chunk
# })
# except Exception as e:
# print(f"Error generating QA: {e}")
# return flashcards
# def process_pdf(pdf_file, questions_per_chunk: int = 2, max_chunks: int = 20):
# """Main processing function."""
# if pdf_file is None:
# return "Please upload a PDF file.", "", "", "Your flashcards will appear here..."
# try:
# # Extract text
# yield "πŸ“„ Extracting text from PDF...", "", "", "Processing..."
# raw_text = extract_text_from_pdf(pdf_file)
# if raw_text.startswith("Error"):
# yield raw_text, "", "", "Error occurred"
# return
# if len(raw_text.strip()) < 100:
# yield "PDF appears to be empty or contains no extractable text.", "", "", "Error occurred"
# return
# # Clean text
# yield "🧹 Cleaning text...", "", "", "Processing..."
# cleaned_text = clean_text(raw_text)
# # Chunk text
# yield "βœ‚οΈ Chunking text into sections...", "", "", "Processing..."
# chunks = chunk_text(cleaned_text)
# # Limit chunks for CPU performance
# chunks = chunks[:max_chunks]
# # Generate flashcards
# all_flashcards = []
# total_chunks = len(chunks)
# for i, chunk in enumerate(chunks):
# progress = f"🎴 Generating flashcards... ({i+1}/{total_chunks} chunks processed)"
# yield progress, "", "", "Processing..."
# cards = generate_qa_pairs(chunk, questions_per_chunk)
# all_flashcards.extend(cards)
# if not all_flashcards:
# yield "Could not generate flashcards from this PDF. Try a PDF with more textual content.", "", "", "No flashcards generated"
# return
# # Format output
# yield "βœ… Finalizing...", "", "", "Almost done..."
# # Create formatted display
# display_text = format_flashcards_display(all_flashcards)
# # Create JSON download
# json_output = json.dumps(all_flashcards, indent=2, ensure_ascii=False)
# # Create Anki/CSV format
# csv_lines = ["Question,Answer"]
# for card in all_flashcards:
# q = card['question'].replace('"', '""')
# a = card['answer'].replace('"', '""')
# csv_lines.append(f'"{q}","{a}"')
# csv_output = "\n".join(csv_lines)
# # FINAL OUTPUT - this updates all components
# yield "βœ… Done! Generated {} flashcards".format(len(all_flashcards)), csv_output, json_output, display_text
# except Exception as e:
# error_msg = f"Error processing PDF: {str(e)}"
# print(error_msg)
# yield error_msg, "", "", error_msg
# def format_flashcards_display(flashcards: List[Dict]) -> str:
# """Format flashcards for nice display."""
# lines = [f"## 🎴 Generated {len(flashcards)} Flashcards\n"]
# for i, card in enumerate(flashcards, 1):
# lines.append(f"### Card {i}")
# lines.append(f"**Q:** {card['question']}")
# lines.append(f"**A:** {card['answer']}")
# lines.append(f"*Context: {card['context'][:100]}...*\n")
# lines.append("---\n")
# return "\n".join(lines)
# def create_sample_flashcard():
# """Create a sample flashcard for demo purposes."""
# sample = [{
# "question": "What is the capital of France?",
# "answer": "Paris is the capital and most populous city of France.",
# "context": "Paris is the capital and most populous city of France..."
# }]
# return format_flashcards_display(sample)
# # Custom CSS for better styling
# custom_css = """
# .flashcard-container {
# border: 2px solid #e0e0e0;
# border-radius: 10px;
# padding: 20px;
# margin: 10px 0;
# background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
# color: white;
# }
# .question {
# font-size: 1.2em;
# font-weight: bold;
# margin-bottom: 10px;
# }
# .answer {
# font-size: 1em;
# opacity: 0.9;
# }
# """
# # Gradio Interface
# with gr.Blocks(css=custom_css, title="PDF to Flashcards") as demo:
# gr.Markdown("""
# # πŸ“š PDF to Flashcards Generator
# Upload any PDF document and automatically generate study flashcards (Q&A pairs) using AI.
# **Features:**
# - 🧠 Uses local CPU-friendly AI (no GPU needed)
# - πŸ“„ Extracts text from any PDF
# - βœ‚οΈ Intelligently chunks content
# - 🎴 Generates question-answer pairs
# - πŸ’Ύ Export to CSV (Anki-compatible) or JSON
# *Note: Processing is done entirely on CPU, so large PDFs may take a few minutes.*
# """)
# with gr.Row():
# with gr.Column(scale=1):
# pdf_input = gr.File(
# label="Upload PDF",
# file_types=[".pdf"],
# type="filepath"
# )
# with gr.Row():
# questions_per_chunk = gr.Slider(
# minimum=1,
# maximum=5,
# value=2,
# step=1,
# label="Questions per section"
# )
# max_chunks = gr.Slider(
# minimum=5,
# maximum=50,
# value=20,
# step=5,
# label="Max sections to process"
# )
# process_btn = gr.Button("πŸš€ Generate Flashcards", variant="primary")
# gr.Markdown("""
# ### πŸ’‘ Tips:
# - Text-based PDFs work best (scanned images won't work)
# - Academic papers and articles work great
# - Adjust "Questions per section" based on content density
# """)
# with gr.Column(scale=2):
# status_text = gr.Textbox(
# label="Status",
# value="Ready to process PDF...",
# interactive=False
# )
# output_display = gr.Markdown(
# label="Generated Flashcards",
# value="Your flashcards will appear here..."
# )
# with gr.Row():
# with gr.Column():
# csv_output = gr.Textbox(
# label="CSV Format (for Anki import)",
# lines=10,
# visible=True
# )
# gr.Markdown("*Copy the CSV content and save as `.csv` file to import into Anki*")
# with gr.Column():
# json_output = gr.Textbox(
# label="JSON Format",
# lines=10,
# visible=True
# )
# gr.Markdown("*Raw JSON data for custom applications*")
# # FIXED: Direct binding without the broken .then() chain
# process_btn.click(
# fn=process_pdf,
# inputs=[pdf_input, questions_per_chunk, max_chunks],
# outputs=[status_text, csv_output, json_output, output_display]
# )
# # Example section
# gr.Markdown("---")
# gr.Markdown("### 🎯 Example Output Format")
# gr.Markdown(create_sample_flashcard())
# if __name__ == "__main__":
# demo.launch()
import gradio as gr
import PyPDF2
import re
import json
from typing import List, Dict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import tempfile
import os
# --- Model initialization (runs at import time) ---------------------------
# Loads a small T5 checkpoint fine-tuned for answer-aware question
# generation ("highlight" input format), so the model is ready before the
# Gradio app serves its first request.
print("Loading models... This may take a minute on first run.")
model_name = "valhalla/t5-small-qg-hl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# eval() disables dropout; the Space runs CPU-only, so pin the device there.
model.eval()
device = torch.device("cpu")
model.to(device)
def extract_key_phrases(text: str) -> List[str]:
    """Pull up to five candidate answer phrases out of *text*.

    Candidates come from two cheap heuristics: runs of capitalized words
    (likely named entities) and concept-introducing wording such as
    "known as X" or "the process of X".
    """
    found: List[str] = []

    # Capitalized word runs are a cheap proxy for named entities.
    found.extend(re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)[:3])

    # Phrases introduced by definitional / conceptual wording.
    for rx in (
        r'(?:process|method|technique|approach|concept|theory|principle|system) of ([^,.]{10,50})',
        r'(?:known as|called|termed|referred to as) ([^,.]{5,40})',
        r'(?:is|are|was|were) (\w+(?:\s+\w+){1,4}) (?:that|which|who)',
    ):
        found.extend(re.findall(rx, text, re.IGNORECASE)[:2])

    # Drop short fragments, then dedupe while preserving first-seen order.
    cleaned = [phrase.strip() for phrase in found if len(phrase.strip()) > 5]
    return list(dict.fromkeys(cleaned))[:5]
def generate_questions(context: str, answer: str, question_type: str = "what", max_length: int = 128) -> str:
    """Generate one question for *answer* highlighted inside *context*.

    Uses the module-level T5 question-generation model. *question_type*
    ("what"/"why"/"how") tweaks decoding parameters and drives the
    post-processing step. Returns "" on failure or an implausibly short
    result.
    """
    try:
        # The valhalla/t5-small-qg-hl checkpoint expects the answer span
        # wrapped in <hl> markers after a "generate question:" prefix.
        prompt = f"generate question: <hl> {answer} <hl> {context}"
        encoded = tokenizer(
            prompt,
            return_tensors="pt",
            max_length=512,
            truncation=True,
            padding=True,
        ).to(device)

        # "why"/"how" questions get slightly hotter sampling and wider
        # beams for more variety than plain factual "what" questions.
        is_what = question_type == "what"
        with torch.no_grad():
            generated = model.generate(
                **encoded,
                max_length=max_length,
                num_beams=4 if is_what else 5,
                early_stopping=True,
                do_sample=True,
                temperature=0.7 if is_what else 0.85,
            )

        text = tokenizer.decode(generated[0], skip_special_tokens=True)
        # Strip any leading "question:"/"q:" label the model sometimes emits.
        text = re.sub(r'^(question:|q:)', '', text, flags=re.IGNORECASE).strip()
        text = improve_question(text, answer, context, question_type)
        return text if len(text) > 10 else ""
    except Exception as e:
        print(f"Error generating question: {e}")
        return ""
def improve_question(question: str, answer: str, context: str, question_type: str) -> str:
    """Normalize a generated question and optionally recast it as why/how."""
    # Normalize punctuation: every question ends with exactly one '?'.
    if not question.endswith('?'):
        question = question.rstrip('.') + '?'
    # Sentence-case the first character.
    if question:
        question = question[0].upper() + question[1:]

    lowered = question.lower()
    if question_type == "why" and not lowered.startswith("why"):
        # Only recast when a linking/aux verb gives us something to anchor on.
        if re.search(r'\b(is|are|was|were|does|do|did)\b', question, re.IGNORECASE):
            question = create_why_question(question, answer, context)
    elif question_type == "how" and not lowered.startswith("how"):
        if re.search(r'\b(does|do|did|can|could)\b', question, re.IGNORECASE):
            question = create_how_question(question, answer, context)
    return question
def create_why_question(base_question: str, answer: str, context: str) -> str:
    """Transform or create a 'why' question from causal cues in *context*."""
    # Causal language in the context suggests a "why" framing is viable.
    causal_patterns = (
        r'because ([^,.]{10,60})',
        r'due to ([^,.]{10,60})',
        r'as a result of ([^,.]{10,60})',
        r'(?:leads to|causes|results in) ([^,.]{10,60})',
        r'in order to ([^,.]{10,60})',
    )
    for rx in causal_patterns:
        if re.search(rx, context, re.IGNORECASE):
            # Anchor on the first capitalized subject followed by a linking verb.
            found = re.search(r'([A-Z][a-z]+(?:\s+[a-z]+){0,3})\s+(?:is|are|was|were|does|do)', context)
            if found:
                subject = found.group(1)
                return f"Why does {subject.lower()} occur?"
    # No usable causal cue: fall back to a generic "why important" framing
    # built from the first few words of the answer.
    words = answer.split()
    if len(words) > 3:
        return f"Why is {' '.join(words[:4])}... important?"
    return base_question
def create_how_question(base_question: str, answer: str, context: str) -> str:
    """Transform or create a 'how' question from process cues in *context*."""
    # Process-indicating phrases: the first pattern that matches wins.
    process_patterns = (
        r'(process|method|procedure|technique|approach) (?:of|for|to) ([^,.]{10,60})',
        r'by ([^,.]{10,60})',
        r'through ([^,.]{10,60})',
    )
    for rx in process_patterns:
        hit = re.search(rx, context, re.IGNORECASE)
        if hit is None:
            continue
        if len(hit.groups()) > 1:
            # Two-group pattern: group 2 is the process description itself.
            process = hit.group(2)
            return f"How does {process.lower()} work?"
        process = hit.group(1)
        return f"How is {process.lower()} achieved?"
    # Fallback: look for an action verb and its subject in the context.
    verbs = re.findall(r'\b(works?|functions?|operates?|performs?|executes?)\b', context, re.IGNORECASE)
    if verbs:
        # NOTE(review): verbs[0] is interpolated into a regex unescaped; safe
        # here only because the verb alternation contains no metacharacters.
        found = re.search(r'([A-Z][a-z]+(?:\s+[a-z]+){0,3})\s+' + verbs[0], context, re.IGNORECASE)
        if found:
            subject = found.group(1)
            return f"How does {subject.lower()} {verbs[0].lower()}?"
    return base_question
def extract_text_from_pdf(pdf_file) -> str:
    """Extract all page text from an uploaded PDF.

    Accepts either a filesystem path or a file-like object — PyPDF2's
    PdfReader handles both — and concatenates non-empty page texts with
    newlines. On any read/parse failure, returns an "Error reading PDF: ..."
    string instead of raising (callers check for the "Error" prefix).
    """
    text = ""
    try:
        # BUG FIX: the previous isinstance(pdf_file, str) branch and its
        # else branch were byte-identical; PdfReader accepts both str paths
        # and binary file objects, so a single call suffices.
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            # Skip pages with no extractable text (e.g. scanned images).
            if page_text:
                text += page_text + "\n"
    except Exception as e:
        return f"Error reading PDF: {str(e)}"
    return text
def clean_text(text: str) -> str:
    """Normalize whitespace and strip unusual symbols from extracted text."""
    # Collapse every run of whitespace (newlines included) to one space.
    collapsed = re.sub(r'\s+', ' ', text)
    # Keep word characters, whitespace, and basic sentence punctuation only.
    filtered = re.sub(r'[^\w\s.,;!?-]', '', collapsed)
    return filtered.strip()
def chunk_text(text: str, max_chunk_size: int = 512, overlap: int = 50) -> List[str]:
    """Split *text* into sentence-aligned chunks with optional lead-in overlap.

    Sentences are packed greedily up to *max_chunk_size* characters; each
    chunk after the first is then prefixed with the tail of its predecessor
    (its last two sentences, or *overlap* characters) for context.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text)

    # Greedy packing: accumulate sentences until the next one would overflow.
    raw_chunks: List[str] = []
    buffer = ""
    for piece in pieces:
        if len(buffer) + len(piece) < max_chunk_size:
            buffer += " " + piece
        else:
            if buffer:
                raw_chunks.append(buffer.strip())
            buffer = piece
    if buffer:
        raw_chunks.append(buffer.strip())

    # Prefix each chunk (except the first) with a lead-in from the previous one.
    result: List[str] = []
    for idx, current in enumerate(raw_chunks):
        if idx > 0 and overlap > 0:
            prior = raw_chunks[idx - 1].split('. ')
            if len(prior) > 1:
                lead_in = '. '.join(prior[-2:])
            else:
                lead_in = raw_chunks[idx - 1][-overlap:]
            current = lead_in + " " + current
        result.append(current)
    return result
def generate_qa_pairs(chunk: str, num_questions: int = 3) -> List[Dict[str, str]]:
    """Build up to *num_questions* flashcards from one text chunk.

    Answer candidates are extracted key phrases plus the chunk's first long
    sentences; question types cycle through what/why/how for variety.
    Returns [] for chunks too short to be useful.
    """
    # Too little material to ask anything meaningful about.
    if len(chunk.split()) < 20:
        return []

    flashcards: List[Dict[str, str]] = []
    try:
        # Candidate answers: key phrases first, then up to two reasonably
        # long sentences taken from the chunk itself.
        phrases = extract_key_phrases(chunk)
        sentence_pool = [s.strip() for s in chunk.split('. ') if len(s.strip()) > 20]
        candidates = phrases + sentence_pool[:2]
        if len(candidates) < 1:
            return []

        cycle = ["what", "why", "how"]
        produced = 0
        for idx, candidate in enumerate(candidates):
            if produced >= num_questions:
                break
            # Very short candidates make poor answers.
            if len(candidate.split()) < 3:
                continue
            # Rotate through question types for diversity.
            q_type = cycle[idx % len(cycle)]
            question = generate_questions(chunk, candidate, question_type=q_type)
            # Keep only non-degenerate questions that differ from the answer.
            if question and question != candidate:
                flashcards.append({
                    "question": question,
                    "answer": candidate,
                    "context": chunk[:200] + "..." if len(chunk) > 200 else chunk,
                    "type": q_type,
                })
                produced += 1
    except Exception as e:
        print(f"Error generating QA: {e}")
    return flashcards
def process_pdf(pdf_file, questions_per_chunk: int = 3, max_chunks: int = 20):
    """Gradio handler: PDF path -> (status, csv, json, markdown display).

    This is a generator — Gradio streams each yielded 4-tuple into the four
    bound output components — so every exit path must *yield* its final
    state. A plain ``return value`` inside a generator stashes the value on
    StopIteration and never reaches the UI.
    """
    if pdf_file is None:
        # BUG FIX: this used to be ``return (...)``, which in a generator
        # silently discards the tuple; yield it so the UI actually updates.
        yield "Please upload a PDF file.", "", "", "Your flashcards will appear here..."
        return
    try:
        # Extract text
        yield "πŸ“„ Extracting text from PDF...", "", "", "Processing..."
        raw_text = extract_text_from_pdf(pdf_file)
        # extract_text_from_pdf signals failure via an "Error ..." string.
        if raw_text.startswith("Error"):
            yield raw_text, "", "", "Error occurred"
            return
        if len(raw_text.strip()) < 100:
            yield "PDF appears to be empty or contains no extractable text.", "", "", "Error occurred"
            return
        # Clean text
        yield "🧹 Cleaning text...", "", "", "Processing..."
        cleaned_text = clean_text(raw_text)
        # Chunk text, capped for CPU-only processing time.
        yield "βœ‚οΈ Chunking text into sections...", "", "", "Processing..."
        chunks = chunk_text(cleaned_text)[:max_chunks]
        # Generate flashcards chunk by chunk, streaming progress updates.
        all_flashcards = []
        total_chunks = len(chunks)
        for i, chunk in enumerate(chunks):
            progress = f"🎴 Generating flashcards... ({i+1}/{total_chunks} chunks processed)"
            yield progress, "", "", "Processing..."
            cards = generate_qa_pairs(chunk, questions_per_chunk)
            all_flashcards.extend(cards)
        if not all_flashcards:
            yield "Could not generate flashcards from this PDF. Try a PDF with more textual content.", "", "", "No flashcards generated"
            return
        # Format output
        yield "βœ… Finalizing...", "", "", "Almost done..."
        display_text = format_flashcards_display(all_flashcards)
        json_output = json.dumps(all_flashcards, indent=2, ensure_ascii=False)
        # CSV with RFC-4180-style doubled quotes so Anki imports cleanly.
        csv_lines = ["Question,Answer,Type"]
        for card in all_flashcards:
            q = card['question'].replace('"', '""')
            a = card['answer'].replace('"', '""')
            t = card.get('type', 'what')
            csv_lines.append(f'"{q}","{a}","{t}"')
        csv_output = "\n".join(csv_lines)
        # Final status line includes a per-type breakdown.
        types_count = {}
        for card in all_flashcards:
            t = card.get('type', 'what')
            types_count[t] = types_count.get(t, 0) + 1
        stats = f"βœ… Done! Generated {len(all_flashcards)} flashcards ("
        stats += ", ".join([f"{count} {qtype}" for qtype, count in types_count.items()]) + ")"
        yield stats, csv_output, json_output, display_text
    except Exception as e:
        error_msg = f"Error processing PDF: {str(e)}"
        print(error_msg)
        yield error_msg, "", "", error_msg
def format_flashcards_display(flashcards: List[Dict]) -> str:
    """Render flashcards as a markdown document for the Gradio display."""
    out = [f"## 🎴 Generated {len(flashcards)} Flashcards\n"]

    # Per-type tally for the breakdown header line.
    tally: Dict[str, int] = {}
    for card in flashcards:
        kind = card.get('type', 'what')
        tally[kind] = tally.get(kind, 0) + 1
    out.append(f"**Breakdown:** {', '.join([f'{count} {qtype.upper()}' for qtype, count in tally.items()])}\n")
    out.append("---\n")

    # One markdown section per card, with a type-specific emoji marker.
    type_emoji = {"WHAT": "❓", "WHY": "πŸ€”"}
    for n, card in enumerate(flashcards, 1):
        kind = card.get('type', 'what').upper()
        out.append(f"### {type_emoji.get(kind, 'πŸ”§')} Card {n} - {kind}")
        out.append(f"**Q:** {card['question']}")
        out.append(f"**A:** {card['answer']}")
        out.append(f"*Context: {card['context'][:100]}...*\n")
        out.append("---\n")
    return "\n".join(out)
def create_sample_flashcard() -> str:
    """Render a fixed what/why/how demo trio through the display formatter."""
    demo_rows = [
        ("what",
         "What is photosynthesis?",
         "Photosynthesis is the process by which plants convert sunlight into energy.",
         "Photosynthesis is the process by which plants convert sunlight into energy..."),
        ("why",
         "Why do plants need chlorophyll?",
         "Chlorophyll absorbs light energy needed for photosynthesis.",
         "Chlorophyll absorbs light energy needed for photosynthesis..."),
        ("how",
         "How do plants convert light into chemical energy?",
         "Through the process of photosynthesis in the chloroplasts.",
         "Through the process of photosynthesis in the chloroplasts..."),
    ]
    cards = [
        {"question": q, "answer": a, "context": c, "type": t}
        for t, q, a, c in demo_rows
    ]
    return format_flashcards_display(cards)
# Custom CSS injected into the Gradio page via gr.Blocks(css=...).
# NOTE(review): no component below assigns these classes via elem_classes —
# presumably kept for future styled card rendering; confirm before removal.
custom_css = """
.flashcard-container {
border: 2px solid #e0e0e0;
border-radius: 10px;
padding: 20px;
margin: 10px 0;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
}
.question {
font-size: 1.2em;
font-weight: bold;
margin-bottom: 10px;
}
.answer {
font-size: 1em;
opacity: 0.9;
}
"""
# Gradio Interface.
# Layout: left column = inputs (PDF upload, tuning sliders, run button,
# tips); right column = streamed status text plus rendered flashcards;
# bottom row = CSV/JSON export textboxes for copy-paste.
with gr.Blocks(css=custom_css, title="PDF to Flashcards") as demo:
    # Header / feature overview shown above the controls.
    gr.Markdown("""
# πŸ“š PDF to Flashcards Generator (Enhanced)
Upload any PDF document and automatically generate study flashcards with **What, Why, and How** questions using AI.
**✨ New Features:**
- 🎯 Generates **What** questions (factual)
- πŸ€” Generates **Why** questions (reasoning)
- πŸ”§ Generates **How** questions (process)
- πŸ“Š Improved question quality and variety
- 🧠 Better answer extraction
**Core Features:**
- 🧠 Uses local CPU-friendly AI (no GPU needed)
- πŸ“„ Extracts text from any PDF
- βœ‚οΈ Intelligently chunks content
- 🎴 Generates diverse question-answer pairs
- πŸ’Ύ Export to CSV (Anki-compatible) or JSON
*Note: Processing is done entirely on CPU, so large PDFs may take a few minutes.*
""")
    with gr.Row():
        with gr.Column(scale=1):
            # type="filepath" means process_pdf receives a filesystem path.
            pdf_input = gr.File(
                label="Upload PDF",
                file_types=[".pdf"],
                type="filepath"
            )
            with gr.Row():
                questions_per_chunk = gr.Slider(
                    minimum=1,
                    maximum=6,
                    value=3,
                    step=1,
                    label="Questions per section"
                )
                max_chunks = gr.Slider(
                    minimum=5,
                    maximum=50,
                    value=20,
                    step=5,
                    label="Max sections to process"
                )
            process_btn = gr.Button("πŸš€ Generate Flashcards", variant="primary")
            gr.Markdown("""
### πŸ’‘ Tips:
- Text-based PDFs work best (scanned images won't work)
- Academic papers and articles work great
- Adjust "Questions per section" for more variety
- Higher questions per section = more Why/How questions
""")
        with gr.Column(scale=2):
            # Receives every intermediate status yield from process_pdf.
            status_text = gr.Textbox(
                label="Status",
                value="Ready to process PDF...",
                interactive=False
            )
            output_display = gr.Markdown(
                label="Generated Flashcards",
                value="Your flashcards will appear here..."
            )
    with gr.Row():
        with gr.Column():
            csv_output = gr.Textbox(
                label="CSV Format (for Anki import)",
                lines=10,
                visible=True
            )
            gr.Markdown("*Copy the CSV content and save as `.csv` file to import into Anki*")
        with gr.Column():
            json_output = gr.Textbox(
                label="JSON Format",
                lines=10,
                visible=True
            )
            gr.Markdown("*Raw JSON data for custom applications*")
    # Direct binding: process_pdf is a generator, so each yielded 4-tuple
    # updates (status, CSV, JSON, display) as processing progresses.
    process_btn.click(
        fn=process_pdf,
        inputs=[pdf_input, questions_per_chunk, max_chunks],
        outputs=[status_text, csv_output, json_output, output_display]
    )
    # Example section, rendered once at build time from the sample cards.
    gr.Markdown("---")
    gr.Markdown("### 🎯 Example Output Format")
    gr.Markdown(create_sample_flashcard())
if __name__ == "__main__":
    demo.launch()