# app.py — Hugging Face Space "PDF to Flashcards" (commit d818498)
# import gradio as gr
# import PyPDF2
# import re
# import json
# from typing import List, Dict
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# import torch
# import tempfile
# import os
# # Initialize the model and tokenizer directly
# print("Loading models... This may take a minute on first run.")
# model_name = "valhalla/t5-small-qg-hl"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# # Set to evaluation mode and CPU
# model.eval()
# device = torch.device("cpu")
# model.to(device)
# def generate_questions(context: str, answer: str, max_length: int = 128) -> str:
# """Generate a question using T5 model."""
# try:
# # Format: "generate question: <hl> answer <hl> context"
# input_text = f"generate question: <hl> {answer} <hl> {context}"
# # Tokenize
# inputs = tokenizer(
# input_text,
# return_tensors="pt",
# max_length=512,
# truncation=True,
# padding=True
# ).to(device)
# # Generate
# with torch.no_grad():
# outputs = model.generate(
# **inputs,
# max_length=max_length,
# num_beams=4,
# early_stopping=True,
# do_sample=True,
# temperature=0.7
# )
# # Decode
# question = tokenizer.decode(outputs[0], skip_special_tokens=True)
# # Clean up
# question = re.sub(r'^(question:|q:)', '', question, flags=re.IGNORECASE).strip()
# return question if len(question) > 10 else ""
# except Exception as e:
# print(f"Error generating question: {e}")
# return ""
# def extract_text_from_pdf(pdf_file) -> str:
# """Extract text from uploaded PDF file."""
# text = ""
# try:
# if isinstance(pdf_file, str):
# pdf_reader = PyPDF2.PdfReader(pdf_file)
# else:
# pdf_reader = PyPDF2.PdfReader(pdf_file)
# for page in pdf_reader.pages:
# page_text = page.extract_text()
# if page_text:
# text += page_text + "\n"
# except Exception as e:
# return f"Error reading PDF: {str(e)}"
# return text
# def clean_text(text: str) -> str:
# """Clean and preprocess extracted text."""
# # Remove excessive whitespace
# text = re.sub(r'\s+', ' ', text)
# # Remove special characters but keep sentence structure
# text = re.sub(r'[^\w\s.,;!?-]', '', text)
# return text.strip()
# def chunk_text(text: str, max_chunk_size: int = 512, overlap: int = 50) -> List[str]:
# """Split text into overlapping chunks for processing."""
# sentences = re.split(r'(?<=[.!?])\s+', text)
# chunks = []
# current_chunk = ""
# for sentence in sentences:
# if len(current_chunk) + len(sentence) < max_chunk_size:
# current_chunk += " " + sentence
# else:
# if current_chunk:
# chunks.append(current_chunk.strip())
# current_chunk = sentence
# if current_chunk:
# chunks.append(current_chunk.strip())
# # Add overlap between chunks for context
# overlapped_chunks = []
# for i, chunk in enumerate(chunks):
# if i > 0 and overlap > 0:
# prev_sentences = chunks[i-1].split('. ')
# overlap_text = '. '.join(prev_sentences[-2:]) if len(prev_sentences) > 1 else chunks[i-1][-overlap:]
# chunk = overlap_text + " " + chunk
# overlapped_chunks.append(chunk)
# return overlapped_chunks
# def generate_qa_pairs(chunk: str, num_questions: int = 2) -> List[Dict[str, str]]:
# """Generate question-answer pairs from a text chunk."""
# flashcards = []
# # Skip chunks that are too short
# words = chunk.split()
# if len(words) < 20:
# return []
# try:
# # Split into sentences to use as answers
# sentences = [s.strip() for s in chunk.split('. ') if len(s.strip()) > 20]
# if len(sentences) < 1:
# return []
# # Generate questions for different sentences
# for i in range(min(num_questions, len(sentences))):
# answer = sentences[i]
# # Skip very short answers
# if len(answer.split()) < 3:
# continue
# question = generate_questions(chunk, answer)
# if question and question != answer: # Make sure they're different
# flashcards.append({
# "question": question,
# "answer": answer,
# "context": chunk[:200] + "..." if len(chunk) > 200 else chunk
# })
# except Exception as e:
# print(f"Error generating QA: {e}")
# return flashcards
# def process_pdf(pdf_file, questions_per_chunk: int = 2, max_chunks: int = 20):
# """Main processing function."""
# if pdf_file is None:
# return "Please upload a PDF file.", "", "", "Your flashcards will appear here..."
# try:
# # Extract text
# yield "πŸ“„ Extracting text from PDF...", "", "", "Processing..."
# raw_text = extract_text_from_pdf(pdf_file)
# if raw_text.startswith("Error"):
# yield raw_text, "", "", "Error occurred"
# return
# if len(raw_text.strip()) < 100:
# yield "PDF appears to be empty or contains no extractable text.", "", "", "Error occurred"
# return
# # Clean text
# yield "🧹 Cleaning text...", "", "", "Processing..."
# cleaned_text = clean_text(raw_text)
# # Chunk text
# yield "βœ‚οΈ Chunking text into sections...", "", "", "Processing..."
# chunks = chunk_text(cleaned_text)
# # Limit chunks for CPU performance
# chunks = chunks[:max_chunks]
# # Generate flashcards
# all_flashcards = []
# total_chunks = len(chunks)
# for i, chunk in enumerate(chunks):
# progress = f"🎴 Generating flashcards... ({i+1}/{total_chunks} chunks processed)"
# yield progress, "", "", "Processing..."
# cards = generate_qa_pairs(chunk, questions_per_chunk)
# all_flashcards.extend(cards)
# if not all_flashcards:
# yield "Could not generate flashcards from this PDF. Try a PDF with more textual content.", "", "", "No flashcards generated"
# return
# # Format output
# yield "βœ… Finalizing...", "", "", "Almost done..."
# # Create formatted display
# display_text = format_flashcards_display(all_flashcards)
# # Create JSON download
# json_output = json.dumps(all_flashcards, indent=2, ensure_ascii=False)
# # Create Anki/CSV format
# csv_lines = ["Question,Answer"]
# for card in all_flashcards:
# q = card['question'].replace('"', '""')
# a = card['answer'].replace('"', '""')
# csv_lines.append(f'"{q}","{a}"')
# csv_output = "\n".join(csv_lines)
# # FINAL OUTPUT - this updates all components
# yield "βœ… Done! Generated {} flashcards".format(len(all_flashcards)), csv_output, json_output, display_text
# except Exception as e:
# error_msg = f"Error processing PDF: {str(e)}"
# print(error_msg)
# yield error_msg, "", "", error_msg
# def format_flashcards_display(flashcards: List[Dict]) -> str:
# """Format flashcards for nice display."""
# lines = [f"## 🎴 Generated {len(flashcards)} Flashcards\n"]
# for i, card in enumerate(flashcards, 1):
# lines.append(f"### Card {i}")
# lines.append(f"**Q:** {card['question']}")
# lines.append(f"**A:** {card['answer']}")
# lines.append(f"*Context: {card['context'][:100]}...*\n")
# lines.append("---\n")
# return "\n".join(lines)
# def create_sample_flashcard():
# """Create a sample flashcard for demo purposes."""
# sample = [{
# "question": "What is the capital of France?",
# "answer": "Paris is the capital and most populous city of France.",
# "context": "Paris is the capital and most populous city of France..."
# }]
# return format_flashcards_display(sample)
# # Custom CSS for better styling
# custom_css = """
# .flashcard-container {
# border: 2px solid #e0e0e0;
# border-radius: 10px;
# padding: 20px;
# margin: 10px 0;
# background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
# color: white;
# }
# .question {
# font-size: 1.2em;
# font-weight: bold;
# margin-bottom: 10px;
# }
# .answer {
# font-size: 1em;
# opacity: 0.9;
# }
# """
# # Gradio Interface
# with gr.Blocks(css=custom_css, title="PDF to Flashcards") as demo:
# gr.Markdown("""
# # πŸ“š PDF to Flashcards Generator
# Upload any PDF document and automatically generate study flashcards (Q&A pairs) using AI.
# **Features:**
# - 🧠 Uses local CPU-friendly AI (no GPU needed)
# - πŸ“„ Extracts text from any PDF
# - βœ‚οΈ Intelligently chunks content
# - 🎴 Generates question-answer pairs
# - πŸ’Ύ Export to CSV (Anki-compatible) or JSON
# *Note: Processing is done entirely on CPU, so large PDFs may take a few minutes.*
# """)
# with gr.Row():
# with gr.Column(scale=1):
# pdf_input = gr.File(
# label="Upload PDF",
# file_types=[".pdf"],
# type="filepath"
# )
# with gr.Row():
# questions_per_chunk = gr.Slider(
# minimum=1,
# maximum=5,
# value=2,
# step=1,
# label="Questions per section"
# )
# max_chunks = gr.Slider(
# minimum=5,
# maximum=50,
# value=20,
# step=5,
# label="Max sections to process"
# )
# process_btn = gr.Button("πŸš€ Generate Flashcards", variant="primary")
# gr.Markdown("""
# ### πŸ’‘ Tips:
# - Text-based PDFs work best (scanned images won't work)
# - Academic papers and articles work great
# - Adjust "Questions per section" based on content density
# """)
# with gr.Column(scale=2):
# status_text = gr.Textbox(
# label="Status",
# value="Ready to process PDF...",
# interactive=False
# )
# output_display = gr.Markdown(
# label="Generated Flashcards",
# value="Your flashcards will appear here..."
# )
# with gr.Row():
# with gr.Column():
# csv_output = gr.Textbox(
# label="CSV Format (for Anki import)",
# lines=10,
# visible=True
# )
# gr.Markdown("*Copy the CSV content and save as `.csv` file to import into Anki*")
# with gr.Column():
# json_output = gr.Textbox(
# label="JSON Format",
# lines=10,
# visible=True
# )
# gr.Markdown("*Raw JSON data for custom applications*")
# # FIXED: Direct binding without the broken .then() chain
# process_btn.click(
# fn=process_pdf,
# inputs=[pdf_input, questions_per_chunk, max_chunks],
# outputs=[status_text, csv_output, json_output, output_display]
# )
# # Example section
# gr.Markdown("---")
# gr.Markdown("### 🎯 Example Output Format")
# gr.Markdown(create_sample_flashcard())
# if __name__ == "__main__":
# demo.launch()
import gradio as gr
import PyPDF2
import re
import json
from typing import List, Dict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import tempfile
import os
# --- Model initialization (runs at import time) ---------------------------
# Loads a small T5 checkpoint fine-tuned for answer-aware question
# generation ("highlight" input format), so the model is ready before the
# Gradio app serves its first request.
print("Loading models... This may take a minute on first run.")
model_name = "valhalla/t5-small-qg-hl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# eval() disables dropout; the Space runs CPU-only, so pin the device there.
model.eval()
device = torch.device("cpu")
model.to(device)
def extract_key_phrases(text: str) -> List[str]:
    """Pull up to five candidate answer phrases out of *text*.

    Candidates come from two cheap heuristics: runs of capitalized words
    (likely named entities) and concept-introducing wording such as
    "known as X" or "the process of X".
    """
    found: List[str] = []

    # Capitalized word runs are a cheap proxy for named entities.
    found.extend(re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)[:3])

    # Phrases introduced by definitional / conceptual wording.
    for rx in (
        r'(?:process|method|technique|approach|concept|theory|principle|system) of ([^,.]{10,50})',
        r'(?:known as|called|termed|referred to as) ([^,.]{5,40})',
        r'(?:is|are|was|were) (\w+(?:\s+\w+){1,4}) (?:that|which|who)',
    ):
        found.extend(re.findall(rx, text, re.IGNORECASE)[:2])

    # Drop short fragments, then dedupe while preserving first-seen order.
    cleaned = [phrase.strip() for phrase in found if len(phrase.strip()) > 5]
    return list(dict.fromkeys(cleaned))[:5]
def generate_questions(context: str, answer: str, question_type: str = "what", max_length: int = 128) -> str:
    """Generate one question for *answer* highlighted inside *context*.

    Uses the module-level T5 question-generation model. *question_type*
    ("what"/"why"/"how") tweaks decoding parameters and drives the
    post-processing step. Returns "" on failure or an implausibly short
    result.
    """
    try:
        # The valhalla/t5-small-qg-hl checkpoint expects the answer span
        # wrapped in <hl> markers after a "generate question:" prefix.
        prompt = f"generate question: <hl> {answer} <hl> {context}"
        encoded = tokenizer(
            prompt,
            return_tensors="pt",
            max_length=512,
            truncation=True,
            padding=True,
        ).to(device)

        # "why"/"how" questions get slightly hotter sampling and wider
        # beams for more variety than plain factual "what" questions.
        is_what = question_type == "what"
        with torch.no_grad():
            generated = model.generate(
                **encoded,
                max_length=max_length,
                num_beams=4 if is_what else 5,
                early_stopping=True,
                do_sample=True,
                temperature=0.7 if is_what else 0.85,
            )

        text = tokenizer.decode(generated[0], skip_special_tokens=True)
        # Strip any leading "question:"/"q:" label the model sometimes emits.
        text = re.sub(r'^(question:|q:)', '', text, flags=re.IGNORECASE).strip()
        text = improve_question(text, answer, context, question_type)
        return text if len(text) > 10 else ""
    except Exception as e:
        print(f"Error generating question: {e}")
        return ""
def improve_question(question: str, answer: str, context: str, question_type: str) -> str:
    """Normalize a generated question and optionally recast it as why/how."""
    # Normalize punctuation: every question ends with exactly one '?'.
    if not question.endswith('?'):
        question = question.rstrip('.') + '?'
    # Sentence-case the first character.
    if question:
        question = question[0].upper() + question[1:]

    lowered = question.lower()
    if question_type == "why" and not lowered.startswith("why"):
        # Only recast when a linking/aux verb gives us something to anchor on.
        if re.search(r'\b(is|are|was|were|does|do|did)\b', question, re.IGNORECASE):
            question = create_why_question(question, answer, context)
    elif question_type == "how" and not lowered.startswith("how"):
        if re.search(r'\b(does|do|did|can|could)\b', question, re.IGNORECASE):
            question = create_how_question(question, answer, context)
    return question
def create_why_question(base_question: str, answer: str, context: str) -> str:
    """Transform or create a 'why' question from causal cues in *context*."""
    # Causal language in the context suggests a "why" framing is viable.
    causal_patterns = (
        r'because ([^,.]{10,60})',
        r'due to ([^,.]{10,60})',
        r'as a result of ([^,.]{10,60})',
        r'(?:leads to|causes|results in) ([^,.]{10,60})',
        r'in order to ([^,.]{10,60})',
    )
    for rx in causal_patterns:
        if re.search(rx, context, re.IGNORECASE):
            # Anchor on the first capitalized subject followed by a linking verb.
            found = re.search(r'([A-Z][a-z]+(?:\s+[a-z]+){0,3})\s+(?:is|are|was|were|does|do)', context)
            if found:
                subject = found.group(1)
                return f"Why does {subject.lower()} occur?"
    # No usable causal cue: fall back to a generic "why important" framing
    # built from the first few words of the answer.
    words = answer.split()
    if len(words) > 3:
        return f"Why is {' '.join(words[:4])}... important?"
    return base_question
def create_how_question(base_question: str, answer: str, context: str) -> str:
    """Transform or create a 'how' question from process cues in *context*."""
    # Process-indicating phrases: the first pattern that matches wins.
    process_patterns = (
        r'(process|method|procedure|technique|approach) (?:of|for|to) ([^,.]{10,60})',
        r'by ([^,.]{10,60})',
        r'through ([^,.]{10,60})',
    )
    for rx in process_patterns:
        hit = re.search(rx, context, re.IGNORECASE)
        if hit is None:
            continue
        if len(hit.groups()) > 1:
            # Two-group pattern: group 2 is the process description itself.
            process = hit.group(2)
            return f"How does {process.lower()} work?"
        process = hit.group(1)
        return f"How is {process.lower()} achieved?"
    # Fallback: look for an action verb and its subject in the context.
    verbs = re.findall(r'\b(works?|functions?|operates?|performs?|executes?)\b', context, re.IGNORECASE)
    if verbs:
        # NOTE(review): verbs[0] is interpolated into a regex unescaped; safe
        # here only because the verb alternation contains no metacharacters.
        found = re.search(r'([A-Z][a-z]+(?:\s+[a-z]+){0,3})\s+' + verbs[0], context, re.IGNORECASE)
        if found:
            subject = found.group(1)
            return f"How does {subject.lower()} {verbs[0].lower()}?"
    return base_question
def extract_text_from_pdf(pdf_file) -> str:
    """Extract all page text from an uploaded PDF.

    Accepts either a filesystem path or a file-like object — PyPDF2's
    PdfReader handles both — and concatenates non-empty page texts with
    newlines. On any read/parse failure, returns an "Error reading PDF: ..."
    string instead of raising (callers check for the "Error" prefix).
    """
    text = ""
    try:
        # BUG FIX: the previous isinstance(pdf_file, str) branch and its
        # else branch were byte-identical; PdfReader accepts both str paths
        # and binary file objects, so a single call suffices.
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            # Skip pages with no extractable text (e.g. scanned images).
            if page_text:
                text += page_text + "\n"
    except Exception as e:
        return f"Error reading PDF: {str(e)}"
    return text
def clean_text(text: str) -> str:
    """Normalize whitespace and strip unusual symbols from extracted text."""
    # Collapse every run of whitespace (newlines included) to one space.
    collapsed = re.sub(r'\s+', ' ', text)
    # Keep word characters, whitespace, and basic sentence punctuation only.
    filtered = re.sub(r'[^\w\s.,;!?-]', '', collapsed)
    return filtered.strip()
def chunk_text(text: str, max_chunk_size: int = 512, overlap: int = 50) -> List[str]:
    """Split *text* into sentence-aligned chunks with optional lead-in overlap.

    Sentences are packed greedily up to *max_chunk_size* characters; each
    chunk after the first is then prefixed with the tail of its predecessor
    (its last two sentences, or *overlap* characters) for context.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text)

    # Greedy packing: accumulate sentences until the next one would overflow.
    raw_chunks: List[str] = []
    buffer = ""
    for piece in pieces:
        if len(buffer) + len(piece) < max_chunk_size:
            buffer += " " + piece
        else:
            if buffer:
                raw_chunks.append(buffer.strip())
            buffer = piece
    if buffer:
        raw_chunks.append(buffer.strip())

    # Prefix each chunk (except the first) with a lead-in from the previous one.
    result: List[str] = []
    for idx, current in enumerate(raw_chunks):
        if idx > 0 and overlap > 0:
            prior = raw_chunks[idx - 1].split('. ')
            if len(prior) > 1:
                lead_in = '. '.join(prior[-2:])
            else:
                lead_in = raw_chunks[idx - 1][-overlap:]
            current = lead_in + " " + current
        result.append(current)
    return result
def generate_qa_pairs(chunk: str, num_questions: int = 3) -> List[Dict[str, str]]:
    """Build up to *num_questions* flashcards from one text chunk.

    Answer candidates are extracted key phrases plus the chunk's first long
    sentences; question types cycle through what/why/how for variety.
    Returns [] for chunks too short to be useful.
    """
    # Too little material to ask anything meaningful about.
    if len(chunk.split()) < 20:
        return []

    flashcards: List[Dict[str, str]] = []
    try:
        # Candidate answers: key phrases first, then up to two reasonably
        # long sentences taken from the chunk itself.
        phrases = extract_key_phrases(chunk)
        sentence_pool = [s.strip() for s in chunk.split('. ') if len(s.strip()) > 20]
        candidates = phrases + sentence_pool[:2]
        if len(candidates) < 1:
            return []

        cycle = ["what", "why", "how"]
        produced = 0
        for idx, candidate in enumerate(candidates):
            if produced >= num_questions:
                break
            # Very short candidates make poor answers.
            if len(candidate.split()) < 3:
                continue
            # Rotate through question types for diversity.
            q_type = cycle[idx % len(cycle)]
            question = generate_questions(chunk, candidate, question_type=q_type)
            # Keep only non-degenerate questions that differ from the answer.
            if question and question != candidate:
                flashcards.append({
                    "question": question,
                    "answer": candidate,
                    "context": chunk[:200] + "..." if len(chunk) > 200 else chunk,
                    "type": q_type,
                })
                produced += 1
    except Exception as e:
        print(f"Error generating QA: {e}")
    return flashcards
def process_pdf(pdf_file, questions_per_chunk: int = 3, max_chunks: int = 20):
    """Gradio handler: PDF path -> (status, csv, json, markdown display).

    This is a generator — Gradio streams each yielded 4-tuple into the four
    bound output components — so every exit path must *yield* its final
    state. A plain ``return value`` inside a generator stashes the value on
    StopIteration and never reaches the UI.
    """
    if pdf_file is None:
        # BUG FIX: this used to be ``return (...)``, which in a generator
        # silently discards the tuple; yield it so the UI actually updates.
        yield "Please upload a PDF file.", "", "", "Your flashcards will appear here..."
        return
    try:
        # Extract text
        yield "πŸ“„ Extracting text from PDF...", "", "", "Processing..."
        raw_text = extract_text_from_pdf(pdf_file)
        # extract_text_from_pdf signals failure via an "Error ..." string.
        if raw_text.startswith("Error"):
            yield raw_text, "", "", "Error occurred"
            return
        if len(raw_text.strip()) < 100:
            yield "PDF appears to be empty or contains no extractable text.", "", "", "Error occurred"
            return
        # Clean text
        yield "🧹 Cleaning text...", "", "", "Processing..."
        cleaned_text = clean_text(raw_text)
        # Chunk text, capped for CPU-only processing time.
        yield "βœ‚οΈ Chunking text into sections...", "", "", "Processing..."
        chunks = chunk_text(cleaned_text)[:max_chunks]
        # Generate flashcards chunk by chunk, streaming progress updates.
        all_flashcards = []
        total_chunks = len(chunks)
        for i, chunk in enumerate(chunks):
            progress = f"🎴 Generating flashcards... ({i+1}/{total_chunks} chunks processed)"
            yield progress, "", "", "Processing..."
            cards = generate_qa_pairs(chunk, questions_per_chunk)
            all_flashcards.extend(cards)
        if not all_flashcards:
            yield "Could not generate flashcards from this PDF. Try a PDF with more textual content.", "", "", "No flashcards generated"
            return
        # Format output
        yield "βœ… Finalizing...", "", "", "Almost done..."
        display_text = format_flashcards_display(all_flashcards)
        json_output = json.dumps(all_flashcards, indent=2, ensure_ascii=False)
        # CSV with RFC-4180-style doubled quotes so Anki imports cleanly.
        csv_lines = ["Question,Answer,Type"]
        for card in all_flashcards:
            q = card['question'].replace('"', '""')
            a = card['answer'].replace('"', '""')
            t = card.get('type', 'what')
            csv_lines.append(f'"{q}","{a}","{t}"')
        csv_output = "\n".join(csv_lines)
        # Final status line includes a per-type breakdown.
        types_count = {}
        for card in all_flashcards:
            t = card.get('type', 'what')
            types_count[t] = types_count.get(t, 0) + 1
        stats = f"βœ… Done! Generated {len(all_flashcards)} flashcards ("
        stats += ", ".join([f"{count} {qtype}" for qtype, count in types_count.items()]) + ")"
        yield stats, csv_output, json_output, display_text
    except Exception as e:
        error_msg = f"Error processing PDF: {str(e)}"
        print(error_msg)
        yield error_msg, "", "", error_msg
def format_flashcards_display(flashcards: List[Dict]) -> str:
    """Render flashcards as a markdown document for the Gradio display."""
    out = [f"## 🎴 Generated {len(flashcards)} Flashcards\n"]

    # Per-type tally for the breakdown header line.
    tally: Dict[str, int] = {}
    for card in flashcards:
        kind = card.get('type', 'what')
        tally[kind] = tally.get(kind, 0) + 1
    out.append(f"**Breakdown:** {', '.join([f'{count} {qtype.upper()}' for qtype, count in tally.items()])}\n")
    out.append("---\n")

    # One markdown section per card, with a type-specific emoji marker.
    type_emoji = {"WHAT": "❓", "WHY": "πŸ€”"}
    for n, card in enumerate(flashcards, 1):
        kind = card.get('type', 'what').upper()
        out.append(f"### {type_emoji.get(kind, 'πŸ”§')} Card {n} - {kind}")
        out.append(f"**Q:** {card['question']}")
        out.append(f"**A:** {card['answer']}")
        out.append(f"*Context: {card['context'][:100]}...*\n")
        out.append("---\n")
    return "\n".join(out)
def create_sample_flashcard() -> str:
    """Render a fixed what/why/how demo trio through the display formatter."""
    demo_rows = [
        ("what",
         "What is photosynthesis?",
         "Photosynthesis is the process by which plants convert sunlight into energy.",
         "Photosynthesis is the process by which plants convert sunlight into energy..."),
        ("why",
         "Why do plants need chlorophyll?",
         "Chlorophyll absorbs light energy needed for photosynthesis.",
         "Chlorophyll absorbs light energy needed for photosynthesis..."),
        ("how",
         "How do plants convert light into chemical energy?",
         "Through the process of photosynthesis in the chloroplasts.",
         "Through the process of photosynthesis in the chloroplasts..."),
    ]
    cards = [
        {"question": q, "answer": a, "context": c, "type": t}
        for t, q, a, c in demo_rows
    ]
    return format_flashcards_display(cards)
# Custom CSS injected into the Gradio page via gr.Blocks(css=...).
# NOTE(review): no component below assigns these classes via elem_classes —
# presumably kept for future styled card rendering; confirm before removal.
custom_css = """
.flashcard-container {
border: 2px solid #e0e0e0;
border-radius: 10px;
padding: 20px;
margin: 10px 0;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
}
.question {
font-size: 1.2em;
font-weight: bold;
margin-bottom: 10px;
}
.answer {
font-size: 1em;
opacity: 0.9;
}
"""
# Gradio Interface.
# Layout: left column = inputs (PDF upload, tuning sliders, run button,
# tips); right column = streamed status text plus rendered flashcards;
# bottom row = CSV/JSON export textboxes for copy-paste.
with gr.Blocks(css=custom_css, title="PDF to Flashcards") as demo:
    # Header / feature overview shown above the controls.
    gr.Markdown("""
# πŸ“š PDF to Flashcards Generator (Enhanced)
Upload any PDF document and automatically generate study flashcards with **What, Why, and How** questions using AI.
**✨ New Features:**
- 🎯 Generates **What** questions (factual)
- πŸ€” Generates **Why** questions (reasoning)
- πŸ”§ Generates **How** questions (process)
- πŸ“Š Improved question quality and variety
- 🧠 Better answer extraction
**Core Features:**
- 🧠 Uses local CPU-friendly AI (no GPU needed)
- πŸ“„ Extracts text from any PDF
- βœ‚οΈ Intelligently chunks content
- 🎴 Generates diverse question-answer pairs
- πŸ’Ύ Export to CSV (Anki-compatible) or JSON
*Note: Processing is done entirely on CPU, so large PDFs may take a few minutes.*
""")
    with gr.Row():
        with gr.Column(scale=1):
            # type="filepath" means process_pdf receives a filesystem path.
            pdf_input = gr.File(
                label="Upload PDF",
                file_types=[".pdf"],
                type="filepath"
            )
            with gr.Row():
                questions_per_chunk = gr.Slider(
                    minimum=1,
                    maximum=6,
                    value=3,
                    step=1,
                    label="Questions per section"
                )
                max_chunks = gr.Slider(
                    minimum=5,
                    maximum=50,
                    value=20,
                    step=5,
                    label="Max sections to process"
                )
            process_btn = gr.Button("πŸš€ Generate Flashcards", variant="primary")
            gr.Markdown("""
### πŸ’‘ Tips:
- Text-based PDFs work best (scanned images won't work)
- Academic papers and articles work great
- Adjust "Questions per section" for more variety
- Higher questions per section = more Why/How questions
""")
        with gr.Column(scale=2):
            # Receives every intermediate status yield from process_pdf.
            status_text = gr.Textbox(
                label="Status",
                value="Ready to process PDF...",
                interactive=False
            )
            output_display = gr.Markdown(
                label="Generated Flashcards",
                value="Your flashcards will appear here..."
            )
    with gr.Row():
        with gr.Column():
            csv_output = gr.Textbox(
                label="CSV Format (for Anki import)",
                lines=10,
                visible=True
            )
            gr.Markdown("*Copy the CSV content and save as `.csv` file to import into Anki*")
        with gr.Column():
            json_output = gr.Textbox(
                label="JSON Format",
                lines=10,
                visible=True
            )
            gr.Markdown("*Raw JSON data for custom applications*")
    # Direct binding: process_pdf is a generator, so each yielded 4-tuple
    # updates (status, CSV, JSON, display) as processing progresses.
    process_btn.click(
        fn=process_pdf,
        inputs=[pdf_input, questions_per_chunk, max_chunks],
        outputs=[status_text, csv_output, json_output, output_display]
    )
    # Example section, rendered once at build time from the sample cards.
    gr.Markdown("---")
    gr.Markdown("### 🎯 Example Output Format")
    gr.Markdown(create_sample_flashcard())
if __name__ == "__main__":
    demo.launch()