Spaces:

heerjtdev
/

example

Sleeping

App Files Files Community

heerjtdev committed on Feb 12

Commit

d818498

verified ·

1 Parent(s): 3eeedea

Update app.py

Browse files

Files changed (1) hide show

app.py +575 -31

app.py CHANGED Viewed

@@ -1,3 +1,374 @@
 import gradio as gr
 import PyPDF2
 import re
@@ -20,8 +391,33 @@ model.eval()
 device = torch.device("cpu")
 model.to(device)
-def generate_questions(context: str, answer: str, max_length: int = 128) -> str:
-    """Generate a question using T5 model."""
     try:
         # Format: "generate question: <hl> answer <hl> context"
         input_text = f"generate question: <hl> {answer} <hl> {context}"
@@ -35,15 +431,19 @@ def generate_questions(context: str, answer: str, max_length: int = 128) -> str:
             padding=True
         ).to(device)
         # Generate
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
                 max_length=max_length,
-                num_beams=4,
                 early_stopping=True,
                 do_sample=True,
-                temperature=0.7
             )
         # Decode
@@ -52,12 +452,97 @@ def generate_questions(context: str, answer: str, max_length: int = 128) -> str:
         # Clean up
         question = re.sub(r'^(question:|q:)', '', question, flags=re.IGNORECASE).strip()
         return question if len(question) > 10 else ""
     except Exception as e:
         print(f"Error generating question: {e}")
         return ""
 def extract_text_from_pdf(pdf_file) -> str:
     """Extract text from uploaded PDF file."""
     text = ""
@@ -112,8 +597,8 @@ def chunk_text(text: str, max_chunk_size: int = 512, overlap: int = 50) -> List[
     return overlapped_chunks
-def generate_qa_pairs(chunk: str, num_questions: int = 2) -> List[Dict[str, str]]:
-    """Generate question-answer pairs from a text chunk."""
     flashcards = []
     # Skip chunks that are too short
@@ -122,35 +607,51 @@ def generate_qa_pairs(chunk: str, num_questions: int = 2) -> List[Dict[str, str]
         return []
     try:
-        # Split into sentences to use as answers
         sentences = [s.strip() for s in chunk.split('. ') if len(s.strip()) > 20]
-        if len(sentences) < 1:
             return []
-        # Generate questions for different sentences
-        for i in range(min(num_questions, len(sentences))):
-            answer = sentences[i]
             # Skip very short answers
             if len(answer.split()) < 3:
                 continue
-            question = generate_questions(chunk, answer)
             if question and question != answer:  # Make sure they're different
                 flashcards.append({
                     "question": question,
                     "answer": answer,
-                    "context": chunk[:200] + "..." if len(chunk) > 200 else chunk
                 })
     except Exception as e:
         print(f"Error generating QA: {e}")
     return flashcards
-def process_pdf(pdf_file, questions_per_chunk: int = 2, max_chunks: int = 20):
     """Main processing function."""
     if pdf_file is None:
         return "Please upload a PDF file.", "", "", "Your flashcards will appear here..."
@@ -204,15 +705,23 @@ def process_pdf(pdf_file, questions_per_chunk: int = 2, max_chunks: int = 20):
         json_output = json.dumps(all_flashcards, indent=2, ensure_ascii=False)
         # Create Anki/CSV format
-        csv_lines = ["Question,Answer"]
         for card in all_flashcards:
             q = card['question'].replace('"', '""')
             a = card['answer'].replace('"', '""')
-            csv_lines.append(f'"{q}","{a}"')
         csv_output = "\n".join(csv_lines)
         # FINAL OUTPUT - this updates all components
-        yield "✅ Done! Generated {} flashcards".format(len(all_flashcards)), csv_output, json_output, display_text
     except Exception as e:
         error_msg = f"Error processing PDF: {str(e)}"
@@ -223,8 +732,20 @@ def format_flashcards_display(flashcards: List[Dict]) -> str:
     """Format flashcards for nice display."""
     lines = [f"## 🎴 Generated {len(flashcards)} Flashcards\n"]
     for i, card in enumerate(flashcards, 1):
-        lines.append(f"### Card {i}")
         lines.append(f"**Q:** {card['question']}")
         lines.append(f"**A:** {card['answer']}")
         lines.append(f"*Context: {card['context'][:100]}...*\n")
@@ -234,11 +755,26 @@ def format_flashcards_display(flashcards: List[Dict]) -> str:
 def create_sample_flashcard():
     """Create a sample flashcard for demo purposes."""
-    sample = [{
-        "question": "What is the capital of France?",
-        "answer": "Paris is the capital and most populous city of France.",
-        "context": "Paris is the capital and most populous city of France..."
-    }]
     return format_flashcards_display(sample)
 # Custom CSS for better styling
@@ -265,15 +801,22 @@ custom_css = """
 # Gradio Interface
 with gr.Blocks(css=custom_css, title="PDF to Flashcards") as demo:
     gr.Markdown("""
-    # 📚 PDF to Flashcards Generator
-    Upload any PDF document and automatically generate study flashcards (Q&A pairs) using AI.
-    **Features:**
     - 🧠 Uses local CPU-friendly AI (no GPU needed)
     - 📄 Extracts text from any PDF
     - ✂️ Intelligently chunks content
-    - 🎴 Generates question-answer pairs
     - 💾 Export to CSV (Anki-compatible) or JSON
     *Note: Processing is done entirely on CPU, so large PDFs may take a few minutes.*
@@ -290,8 +833,8 @@ with gr.Blocks(css=custom_css, title="PDF to Flashcards") as demo:
             with gr.Row():
                 questions_per_chunk = gr.Slider(
                     minimum=1,
-                    maximum=5,
-                    value=2,
                     step=1,
                     label="Questions per section"
                 )
@@ -309,7 +852,8 @@ with gr.Blocks(css=custom_css, title="PDF to Flashcards") as demo:
             ### 💡 Tips:
             - Text-based PDFs work best (scanned images won't work)
             - Academic papers and articles work great
-            - Adjust "Questions per section" based on content density
             """)
         with gr.Column(scale=2):
@@ -341,7 +885,7 @@ with gr.Blocks(css=custom_css, title="PDF to Flashcards") as demo:
             )
             gr.Markdown("*Raw JSON data for custom applications*")
-    # FIXED: Direct binding without the broken .then() chain
     process_btn.click(
         fn=process_pdf,
         inputs=[pdf_input, questions_per_chunk, max_chunks],

+# import gradio as gr
+# import PyPDF2
+# import re
+# import json
+# from typing import List, Dict
+# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+# import torch
+# import tempfile
+# import os
+# # Initialize the model and tokenizer directly
+# print("Loading models... This may take a minute on first run.")
+# model_name = "valhalla/t5-small-qg-hl"
+# tokenizer = AutoTokenizer.from_pretrained(model_name)
+# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+# # Set to evaluation mode and CPU
+# model.eval()
+# device = torch.device("cpu")
+# model.to(device)
+# def generate_questions(context: str, answer: str, max_length: int = 128) -> str:
+#     """Generate a question using T5 model."""
+#     try:
+#         # Format: "generate question: <hl> answer <hl> context"
+#         input_text = f"generate question: <hl> {answer} <hl> {context}"
+#         # Tokenize
+#         inputs = tokenizer(
+#             input_text,
+#             return_tensors="pt",
+#             max_length=512,
+#             truncation=True,
+#             padding=True
+#         ).to(device)
+#         # Generate
+#         with torch.no_grad():
+#             outputs = model.generate(
+#                 **inputs,
+#                 max_length=max_length,
+#                 num_beams=4,
+#                 early_stopping=True,
+#                 do_sample=True,
+#                 temperature=0.7
+#             )
+#         # Decode
+#         question = tokenizer.decode(outputs[0], skip_special_tokens=True)
+#         # Clean up
+#         question = re.sub(r'^(question:|q:)', '', question, flags=re.IGNORECASE).strip()
+#         return question if len(question) > 10 else ""
+#     except Exception as e:
+#         print(f"Error generating question: {e}")
+#         return ""
+# def extract_text_from_pdf(pdf_file) -> str:
+#     """Extract text from uploaded PDF file."""
+#     text = ""
+#     try:
+#         if isinstance(pdf_file, str):
+#             pdf_reader = PyPDF2.PdfReader(pdf_file)
+#         else:
+#             pdf_reader = PyPDF2.PdfReader(pdf_file)
+#         for page in pdf_reader.pages:
+#             page_text = page.extract_text()
+#             if page_text:
+#                 text += page_text + "\n"
+#     except Exception as e:
+#         return f"Error reading PDF: {str(e)}"
+#     return text
+# def clean_text(text: str) -> str:
+#     """Clean and preprocess extracted text."""
+#     # Remove excessive whitespace
+#     text = re.sub(r'\s+', ' ', text)
+#     # Remove special characters but keep sentence structure
+#     text = re.sub(r'[^\w\s.,;!?-]', '', text)
+#     return text.strip()
+# def chunk_text(text: str, max_chunk_size: int = 512, overlap: int = 50) -> List[str]:
+#     """Split text into overlapping chunks for processing."""
+#     sentences = re.split(r'(?<=[.!?])\s+', text)
+#     chunks = []
+#     current_chunk = ""
+#     for sentence in sentences:
+#         if len(current_chunk) + len(sentence) < max_chunk_size:
+#             current_chunk += " " + sentence
+#         else:
+#             if current_chunk:
+#                 chunks.append(current_chunk.strip())
+#             current_chunk = sentence
+#     if current_chunk:
+#         chunks.append(current_chunk.strip())
+#     # Add overlap between chunks for context
+#     overlapped_chunks = []
+#     for i, chunk in enumerate(chunks):
+#         if i > 0 and overlap > 0:
+#             prev_sentences = chunks[i-1].split('. ')
+#             overlap_text = '. '.join(prev_sentences[-2:]) if len(prev_sentences) > 1 else chunks[i-1][-overlap:]
+#             chunk = overlap_text + " " + chunk
+#         overlapped_chunks.append(chunk)
+#     return overlapped_chunks
+# def generate_qa_pairs(chunk: str, num_questions: int = 2) -> List[Dict[str, str]]:
+#     """Generate question-answer pairs from a text chunk."""
+#     flashcards = []
+#     # Skip chunks that are too short
+#     words = chunk.split()
+#     if len(words) < 20:
+#         return []
+#     try:
+#         # Split into sentences to use as answers
+#         sentences = [s.strip() for s in chunk.split('. ') if len(s.strip()) > 20]
+#         if len(sentences) < 1:
+#             return []
+#         # Generate questions for different sentences
+#         for i in range(min(num_questions, len(sentences))):
+#             answer = sentences[i]
+#             # Skip very short answers
+#             if len(answer.split()) < 3:
+#                 continue
+#             question = generate_questions(chunk, answer)
+#             if question and question != answer:  # Make sure they're different
+#                 flashcards.append({
+#                     "question": question,
+#                     "answer": answer,
+#                     "context": chunk[:200] + "..." if len(chunk) > 200 else chunk
+#                 })
+#     except Exception as e:
+#         print(f"Error generating QA: {e}")
+#     return flashcards
+# def process_pdf(pdf_file, questions_per_chunk: int = 2, max_chunks: int = 20):
+#     """Main processing function."""
+#     if pdf_file is None:
+#         return "Please upload a PDF file.", "", "", "Your flashcards will appear here..."
+#     try:
+#         # Extract text
+#         yield "📄 Extracting text from PDF...", "", "", "Processing..."
+#         raw_text = extract_text_from_pdf(pdf_file)
+#         if raw_text.startswith("Error"):
+#             yield raw_text, "", "", "Error occurred"
+#             return
+#         if len(raw_text.strip()) < 100:
+#             yield "PDF appears to be empty or contains no extractable text.", "", "", "Error occurred"
+#             return
+#         # Clean text
+#         yield "🧹 Cleaning text...", "", "", "Processing..."
+#         cleaned_text = clean_text(raw_text)
+#         # Chunk text
+#         yield "✂️ Chunking text into sections...", "", "", "Processing..."
+#         chunks = chunk_text(cleaned_text)
+#         # Limit chunks for CPU performance
+#         chunks = chunks[:max_chunks]
+#         # Generate flashcards
+#         all_flashcards = []
+#         total_chunks = len(chunks)
+#         for i, chunk in enumerate(chunks):
+#             progress = f"🎴 Generating flashcards... ({i+1}/{total_chunks} chunks processed)"
+#             yield progress, "", "", "Processing..."
+#             cards = generate_qa_pairs(chunk, questions_per_chunk)
+#             all_flashcards.extend(cards)
+#         if not all_flashcards:
+#             yield "Could not generate flashcards from this PDF. Try a PDF with more textual content.", "", "", "No flashcards generated"
+#             return
+#         # Format output
+#         yield "✅ Finalizing...", "", "", "Almost done..."
+#         # Create formatted display
+#         display_text = format_flashcards_display(all_flashcards)
+#         # Create JSON download
+#         json_output = json.dumps(all_flashcards, indent=2, ensure_ascii=False)
+#         # Create Anki/CSV format
+#         csv_lines = ["Question,Answer"]
+#         for card in all_flashcards:
+#             q = card['question'].replace('"', '""')
+#             a = card['answer'].replace('"', '""')
+#             csv_lines.append(f'"{q}","{a}"')
+#         csv_output = "\n".join(csv_lines)
+#         # FINAL OUTPUT - this updates all components
+#         yield "✅ Done! Generated {} flashcards".format(len(all_flashcards)), csv_output, json_output, display_text
+#     except Exception as e:
+#         error_msg = f"Error processing PDF: {str(e)}"
+#         print(error_msg)
+#         yield error_msg, "", "", error_msg
+# def format_flashcards_display(flashcards: List[Dict]) -> str:
+#     """Format flashcards for nice display."""
+#     lines = [f"## 🎴 Generated {len(flashcards)} Flashcards\n"]
+#     for i, card in enumerate(flashcards, 1):
+#         lines.append(f"### Card {i}")
+#         lines.append(f"**Q:** {card['question']}")
+#         lines.append(f"**A:** {card['answer']}")
+#         lines.append(f"*Context: {card['context'][:100]}...*\n")
+#         lines.append("---\n")
+#     return "\n".join(lines)
+# def create_sample_flashcard():
+#     """Create a sample flashcard for demo purposes."""
+#     sample = [{
+#         "question": "What is the capital of France?",
+#         "answer": "Paris is the capital and most populous city of France.",
+#         "context": "Paris is the capital and most populous city of France..."
+#     }]
+#     return format_flashcards_display(sample)
+# # Custom CSS for better styling
+# custom_css = """
+# .flashcard-container {
+#     border: 2px solid #e0e0e0;
+#     border-radius: 10px;
+#     padding: 20px;
+#     margin: 10px 0;
+#     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+#     color: white;
+# }
+# .question {
+#     font-size: 1.2em;
+#     font-weight: bold;
+#     margin-bottom: 10px;
+# }
+# .answer {
+#     font-size: 1em;
+#     opacity: 0.9;
+# }
+# """
+# # Gradio Interface
+# with gr.Blocks(css=custom_css, title="PDF to Flashcards") as demo:
+#     gr.Markdown("""
+#     # 📚 PDF to Flashcards Generator
+#     Upload any PDF document and automatically generate study flashcards (Q&A pairs) using AI.
+#     **Features:**
+#     - 🧠 Uses local CPU-friendly AI (no GPU needed)
+#     - 📄 Extracts text from any PDF
+#     - ✂️ Intelligently chunks content
+#     - 🎴 Generates question-answer pairs
+#     - 💾 Export to CSV (Anki-compatible) or JSON
+#     *Note: Processing is done entirely on CPU, so large PDFs may take a few minutes.*
+#     """)
+#     with gr.Row():
+#         with gr.Column(scale=1):
+#             pdf_input = gr.File(
+#                 label="Upload PDF",
+#                 file_types=[".pdf"],
+#                 type="filepath"
+#             )
+#             with gr.Row():
+#                 questions_per_chunk = gr.Slider(
+#                     minimum=1,
+#                     maximum=5,
+#                     value=2,
+#                     step=1,
+#                     label="Questions per section"
+#                 )
+#                 max_chunks = gr.Slider(
+#                     minimum=5,
+#                     maximum=50,
+#                     value=20,
+#                     step=5,
+#                     label="Max sections to process"
+#                 )
+#             process_btn = gr.Button("🚀 Generate Flashcards", variant="primary")
+#             gr.Markdown("""
+#             ### 💡 Tips:
+#             - Text-based PDFs work best (scanned images won't work)
+#             - Academic papers and articles work great
+#             - Adjust "Questions per section" based on content density
+#             """)
+#         with gr.Column(scale=2):
+#             status_text = gr.Textbox(
+#                 label="Status",
+#                 value="Ready to process PDF...",
+#                 interactive=False
+#             )
+#             output_display = gr.Markdown(
+#                 label="Generated Flashcards",
+#                 value="Your flashcards will appear here..."
+#             )
+#     with gr.Row():
+#         with gr.Column():
+#             csv_output = gr.Textbox(
+#                 label="CSV Format (for Anki import)",
+#                 lines=10,
+#                 visible=True
+#             )
+#             gr.Markdown("*Copy the CSV content and save as `.csv` file to import into Anki*")
+#         with gr.Column():
+#             json_output = gr.Textbox(
+#                 label="JSON Format",
+#                 lines=10,
+#                 visible=True
+#             )
+#             gr.Markdown("*Raw JSON data for custom applications*")
+#     # FIXED: Direct binding without the broken .then() chain
+#     process_btn.click(
+#         fn=process_pdf,
+#         inputs=[pdf_input, questions_per_chunk, max_chunks],
+#         outputs=[status_text, csv_output, json_output, output_display]
+#     )
+#     # Example section
+#     gr.Markdown("---")
+#     gr.Markdown("### 🎯 Example Output Format")
+#     gr.Markdown(create_sample_flashcard())
+# if __name__ == "__main__":
+#     demo.launch()
 import gradio as gr
 import PyPDF2
 import re
 device = torch.device("cpu")
 model.to(device)
+def extract_key_phrases(text: str) -> List[str]:
+    """Collect up to five candidate answer phrases from ``text``.
+
+    Two regex passes feed the candidate list: runs of capitalized words (at
+    most three kept) and the phrase captured after a cue such as "process of",
+    "known as" or "is ... that" (at most two kept per cue pattern). Phrases
+    that are five characters or shorter once stripped are discarded, and
+    duplicates are removed while keeping first-seen order.
+    """
+    # Runs of Capitalized Words serve as a cheap stand-in for named entities.
+    entity_regex = r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b'
+    # Each of these captures the phrase that follows a cue word or verb.
+    cue_regexes = (
+        r'(?:process|method|technique|approach|concept|theory|principle|system) of ([^,.]{10,50})',
+        r'(?:known as|called|termed|referred to as) ([^,.]{5,40})',
+        r'(?:is|are|was|were) (\w+(?:\s+\w+){1,4}) (?:that|which|who)',
+    )
+    found = re.findall(entity_regex, text)[:3]
+    for cue_regex in cue_regexes:
+        found += re.findall(cue_regex, text, re.IGNORECASE)[:2]
+    # A dict preserves insertion order, so it doubles as an ordered set.
+    ordered = {}
+    for phrase in found:
+        trimmed = phrase.strip()
+        if len(trimmed) > 5:
+            ordered.setdefault(trimmed, None)
+    return list(ordered)[:5]
+def generate_questions(context: str, answer: str, question_type: str = "what", max_length: int = 128) -> str:
+    """Generate a question using T5 model with specified type."""
     try:
         # Format: "generate question: <hl> answer <hl> context"
         input_text = f"generate question: <hl> {answer} <hl> {context}"
             padding=True
         ).to(device)
+        # Generate with different parameters based on question type
+        temperature = 0.7 if question_type == "what" else 0.85
+        num_beams = 4 if question_type == "what" else 5
         # Generate
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
                 max_length=max_length,
+                num_beams=num_beams,
                 early_stopping=True,
                 do_sample=True,
+                temperature=temperature
             )
         # Decode
         # Clean up
         question = re.sub(r'^(question:|q:)', '', question, flags=re.IGNORECASE).strip()
+        # Post-process to improve question quality
+        question = improve_question(question, answer, context, question_type)
         return question if len(question) > 10 else ""
     except Exception as e:
         print(f"Error generating question: {e}")
         return ""
+def improve_question(question: str, answer: str, context: str, question_type: str) -> str:
+    """Normalize a generated question and optionally recast it as why/how.
+
+    The question gets a trailing '?' (replacing any trailing periods) and an
+    upper-cased first character. When ``question_type`` is "why" or "how" and
+    the question does not already start with that word, it is passed to
+    ``create_why_question`` / ``create_how_question`` -- but only if it
+    contains one of the auxiliary verbs tested below.
+
+    Returns "" for an empty or whitespace-only ``question`` instead of
+    fabricating a bare "?".
+    """
+    # Nothing to polish: do not invent a question out of an empty string.
+    if not question.strip():
+        return ""
+    # Ensure question ends with question mark
+    if not question.endswith('?'):
+        question = question.rstrip('.') + '?'
+    # Capitalize first letter
+    question = question[0].upper() + question[1:]
+    lowered = question.lower()
+    # Try to transform to why/how questions if specified
+    if question_type == "why" and not lowered.startswith("why"):
+        if re.search(r'\b(is|are|was|were|does|do|did)\b', question, re.IGNORECASE):
+            question = create_why_question(question, answer, context)
+    elif question_type == "how" and not lowered.startswith("how"):
+        if re.search(r'\b(does|do|did|can|could)\b', question, re.IGNORECASE):
+            question = create_how_question(question, answer, context)
+    return question
+def create_why_question(base_question: str, answer: str, context: str) -> str:
+    """Build a 'why' question from ``context``/``answer``, else return ``base_question``.
+
+    If ``context`` contains a causal cue (because, due to, as a result of,
+    leads to/causes/results in, in order to) and a capitalized subject followed
+    by is/are/was/were/does/do can be located, the result is
+    "Why does <subject> occur?". The text captured after the cue is not used.
+    Otherwise, when ``answer`` has more than three words, the result is a
+    generic "Why is <first four words>... important?". Failing both,
+    ``base_question`` is returned unchanged.
+    """
+    causal_cues = (
+        r'because ([^,.]{10,60})',
+        r'due to ([^,.]{10,60})',
+        r'as a result of ([^,.]{10,60})',
+        r'(?:leads to|causes|results in) ([^,.]{10,60})',
+        r'in order to ([^,.]{10,60})',
+    )
+    has_causal_cue = any(re.search(cue, context, re.IGNORECASE) for cue in causal_cues)
+    if has_causal_cue:
+        # The subject lookup does not depend on which cue matched, so run it once.
+        subject_hit = re.search(r'([A-Z][a-z]+(?:\s+[a-z]+){0,3})\s+(?:is|are|was|were|does|do)', context)
+        if subject_hit is not None:
+            return f"Why does {subject_hit.group(1).lower()} occur?"
+    # Generic fallback built from the opening words of the answer.
+    answer_words = answer.split()
+    if len(answer_words) <= 3:
+        return base_question
+    return f"Why is {' '.join(answer_words[:4])}... important?"
+def create_how_question(base_question: str, answer: str, context: str) -> str:
+    """Build a 'how' question from ``context``, else return ``base_question``.
+
+    The first process pattern found in ``context`` decides the wording:
+    "<process|method|...> of/for/to X" yields "How does X work?", while
+    "by X" or "through X" yields "How is X achieved?". If none match, the
+    first works/functions/operates/performs/executes verb in ``context`` is
+    paired with the words preceding it to form "How does <subject> <verb>?".
+    ``answer`` is accepted for signature parity with ``create_why_question``
+    but is not read.
+    """
+    # Look for process indicators
+    process_patterns = [
+        r'(process|method|procedure|technique|approach) (?:of|for|to) ([^,.]{10,60})',
+        # \b stops "by"/"through" from matching the tail of a longer word
+        # (e.g. "baby sleeps ...", "breakthrough in ...").
+        r'\bby ([^,.]{10,60})',
+        r'\bthrough ([^,.]{10,60})'
+    ]
+    for pattern in process_patterns:
+        match = re.search(pattern, context, re.IGNORECASE)
+        if not match:
+            continue
+        # Two capture groups means the "<process word> of X" pattern matched.
+        if len(match.groups()) > 1:
+            return f"How does {match.group(2).lower()} work?"
+        return f"How is {match.group(1).lower()} achieved?"
+    # Fallback: create a generic how question
+    verbs = re.findall(r'\b(works?|functions?|operates?|performs?|executes?)\b', context, re.IGNORECASE)
+    if verbs:
+        subject_match = re.search(r'([A-Z][a-z]+(?:\s+[a-z]+){0,3})\s+' + verbs[0], context, re.IGNORECASE)
+        if subject_match:
+            subject = subject_match.group(1)
+            return f"How does {subject.lower()} {verbs[0].lower()}?"
+    return base_question
 def extract_text_from_pdf(pdf_file) -> str:
     """Extract text from uploaded PDF file."""
     text = ""
     return overlapped_chunks
+def generate_qa_pairs(chunk: str, num_questions: int = 3) -> List[Dict[str, str]]:
+    """Generate question-answer pairs from a text chunk with variety."""
     flashcards = []
     # Skip chunks that are too short
         return []
     try:
+        # Extract key phrases for answers
+        key_phrases = extract_key_phrases(chunk)
+        # Also use sentences as potential answers
         sentences = [s.strip() for s in chunk.split('. ') if len(s.strip()) > 20]
+        # Combine both sources
+        answer_candidates = key_phrases + sentences[:2]
+        if len(answer_candidates) < 1:
             return []
+        # Define question types to generate
+        question_types = ["what", "why", "how"]
+        # Generate diverse questions
+        questions_generated = 0
+        for i, answer in enumerate(answer_candidates):
+            if questions_generated >= num_questions:
+                break
             # Skip very short answers
             if len(answer.split()) < 3:
                 continue
+            # Cycle through question types
+            q_type = question_types[i % len(question_types)]
+            question = generate_questions(chunk, answer, question_type=q_type)
             if question and question != answer:  # Make sure they're different
                 flashcards.append({
                     "question": question,
                     "answer": answer,
+                    "context": chunk[:200] + "..." if len(chunk) > 200 else chunk,
+                    "type": q_type
                 })
+                questions_generated += 1
     except Exception as e:
         print(f"Error generating QA: {e}")
     return flashcards
+def process_pdf(pdf_file, questions_per_chunk: int = 3, max_chunks: int = 20):
     """Main processing function."""
     if pdf_file is None:
         return "Please upload a PDF file.", "", "", "Your flashcards will appear here..."
         json_output = json.dumps(all_flashcards, indent=2, ensure_ascii=False)
         # Create Anki/CSV format
+        csv_lines = ["Question,Answer,Type"]
         for card in all_flashcards:
             q = card['question'].replace('"', '""')
             a = card['answer'].replace('"', '""')
+            t = card.get('type', 'what')
+            csv_lines.append(f'"{q}","{a}","{t}"')
         csv_output = "\n".join(csv_lines)
         # FINAL OUTPUT - this updates all components
+        stats = f"✅ Done! Generated {len(all_flashcards)} flashcards ("
+        types_count = {}
+        for card in all_flashcards:
+            t = card.get('type', 'what')
+            types_count[t] = types_count.get(t, 0) + 1
+        stats += ", ".join([f"{count} {qtype}" for qtype, count in types_count.items()]) + ")"
+        yield stats, csv_output, json_output, display_text
     except Exception as e:
         error_msg = f"Error processing PDF: {str(e)}"
     """Format flashcards for nice display."""
     lines = [f"## 🎴 Generated {len(flashcards)} Flashcards\n"]
+    # Count by type
+    types_count = {}
+    for card in flashcards:
+        t = card.get('type', 'what')
+        types_count[t] = types_count.get(t, 0) + 1
+    lines.append(f"**Breakdown:** {', '.join([f'{count} {qtype.upper()}' for qtype, count in types_count.items()])}\n")
+    lines.append("---\n")
     for i, card in enumerate(flashcards, 1):
+        qtype = card.get('type', 'what').upper()
+        emoji = "❓" if qtype == "WHAT" else "🤔" if qtype == "WHY" else "🔧"
+        lines.append(f"### {emoji} Card {i} - {qtype}")
         lines.append(f"**Q:** {card['question']}")
         lines.append(f"**A:** {card['answer']}")
         lines.append(f"*Context: {card['context'][:100]}...*\n")
 def create_sample_flashcard():
     """Create a sample flashcard for demo purposes."""
+    # Three hard-coded cards, one per question type ("what", "why", "how"); the list is rendered by format_flashcards_display below.
+    sample = [
+        {
+            "question": "What is photosynthesis?",
+            "answer": "Photosynthesis is the process by which plants convert sunlight into energy.",
+            "context": "Photosynthesis is the process by which plants convert sunlight into energy...",
+            "type": "what"
+        },
+        {
+            "question": "Why do plants need chlorophyll?",
+            "answer": "Chlorophyll absorbs light energy needed for photosynthesis.",
+            "context": "Chlorophyll absorbs light energy needed for photosynthesis...",
+            "type": "why"
+        },
+        {
+            "question": "How do plants convert light into chemical energy?",
+            "answer": "Through the process of photosynthesis in the chloroplasts.",
+            "context": "Through the process of photosynthesis in the chloroplasts...",
+            "type": "how"
+        }
+    ]
     return format_flashcards_display(sample)
 # Custom CSS for better styling
 # Gradio Interface
 with gr.Blocks(css=custom_css, title="PDF to Flashcards") as demo:
     gr.Markdown("""
+    # 📚 PDF to Flashcards Generator (Enhanced)
+    Upload any PDF document and automatically generate study flashcards with **What, Why, and How** questions using AI.
+    **✨ New Features:**
+    - 🎯 Generates **What** questions (factual)
+    - 🤔 Generates **Why** questions (reasoning)
+    - 🔧 Generates **How** questions (process)
+    - 📊 Improved question quality and variety
+    - 🧠 Better answer extraction
+    **Core Features:**
     - 🧠 Uses local CPU-friendly AI (no GPU needed)
     - 📄 Extracts text from any PDF
     - ✂️ Intelligently chunks content
+    - 🎴 Generates diverse question-answer pairs
     - 💾 Export to CSV (Anki-compatible) or JSON
     *Note: Processing is done entirely on CPU, so large PDFs may take a few minutes.*
             with gr.Row():
                 questions_per_chunk = gr.Slider(
                     minimum=1,
+                    maximum=6,
+                    value=3,
                     step=1,
                     label="Questions per section"
                 )
             ### 💡 Tips:
             - Text-based PDFs work best (scanned images won't work)
             - Academic papers and articles work great
+            - Adjust "Questions per section" for more variety
+            - Higher questions per section = more Why/How questions
             """)
         with gr.Column(scale=2):
             )
             gr.Markdown("*Raw JSON data for custom applications*")
+    # Direct binding
     process_btn.click(
         fn=process_pdf,
         inputs=[pdf_input, questions_per_chunk, max_chunks],