Update app.py
app.py
CHANGED
@@ -1,415 +1,324 @@
 import gradio as gr
-import
-import torch
-import os
 import re
-import
-from
-
-
-
-from langchain_community.vectorstores import FAISS
-from langchain_core.embeddings import Embeddings
-from transformers import AutoTokenizer
-from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTModelForCausalLM
-from huggingface_hub import snapshot_download
-from sentence_transformers import SentenceTransformer  # Add this for cross-encoder
-
-PROVIDERS = ["CPUExecutionProvider"]
-
-# ---------------------------------------------------------
-# 1. EMBEDDINGS (Your existing code - good)
-# ---------------------------------------------------------
-class OnnxBgeEmbeddings(Embeddings):
-    def __init__(self):
-        model_name = "Xenova/bge-small-en-v1.5"
-        print(f"🚀 Loading Embeddings: {model_name}...")
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-        self.model = ORTModelForFeatureExtraction.from_pretrained(
-            model_name, export=False, provider=PROVIDERS[0]
-        )
 
-
-
-
-
-
-
-
 
-
-
 
-
-
 
-
-
-
-
-    """
-    Extracts key concepts from context and checks student answer coverage.
-    Works 100% on CPU, deterministic, explainable.
-    """
 
-
-
-
-
 
-
-
-    Extract key noun phrases and important terms from context.
-    Uses simple but effective heuristics.
-    """
-    # Clean text
-    text = re.sub(r'[^\w\s]', ' ', text.lower())
-    words = text.split()
-
-    # Remove stopwords
-    stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'shall', 'can', 'need', 'dare', 'ought', 'used', 'it', 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'we', 'they'}
-
-    # Get word frequencies (excluding stopwords)
-    words = [w for w in words if w not in stopwords and len(w) > 2]
-    word_freq = Counter(words)
-
-    # Get bigrams (two-word phrases)
-    bigrams = [f"{words[i]} {words[i+1]}" for i in range(len(words)-1)]
-    bigram_freq = Counter(bigrams)
-
-    # Combine unigrams and bigrams
-    concepts = []
-    for word, count in word_freq.most_common(top_k):
-        if count > 1:  # Only include words that appear multiple times
-            concepts.append(word)
-
-    for bigram, count in bigram_freq.most_common(top_k//2):
-        if count > 1:
-            concepts.append(bigram)
-
-    return list(set(concepts))[:top_k]  # Remove duplicates, limit to top_k
 
-
-
-
-
-
-
-
-
-
-    for concept in key_concepts:
-        # Check for exact match or partial match
-        if concept in student_lower:
-            found_concepts.append(concept)
-        else:
-            # Check for word stems (e.g., "running" matches "run")
-            concept_words = concept.split()
-            if all(any(word in student_lower for word in [cw, cw+'s', cw+'es', cw+'ed', cw+'ing']) for cw in concept_words):
-                found_concepts.append(concept)
-            else:
-                missing_concepts.append(concept)
-
-    coverage = len(found_concepts) / len(key_concepts) if key_concepts else 0
-    return coverage, found_concepts, missing_concepts
 
-
-
-
-
-
-    answer_lower = student_answer.lower()
-
-    # Common negation patterns
-    negation_words = ['not', 'no', 'never', 'none', 'nothing', 'nobody', 'neither', 'nowhere', 'hardly', 'scarcely', 'barely', "doesn't", "isn't", "wasn't", "shouldn't", "wouldn't", "couldn't", "can't", "don't", "didn't", "hasn't", "haven't", "hadn't", "won't"]
-
-    contradictions = []
-
-    # Extract sentences from context that contain key facts
-    context_sentences = [s.strip() for s in context.split('.') if len(s.strip()) > 10]
-
-    for sent in context_sentences:
-        sent_lower = sent.lower()
-        # Check if student says opposite
-        for neg in negation_words:
-            if neg in sent_lower:
-                # Context has negation, check if student affirms
-                positive_version = sent_lower.replace(neg, '').strip()
-                if any(word in answer_lower for word in positive_version.split()[:5]):
-                    contradictions.append(f"Context says: '{sent}' but student contradicts this")
-            else:
-                # Context is positive, check if student negates
-                # This is harder - would need semantic understanding
-                pass
-
-    return contradictions
 
-
-
-
-    """
-    context_emb = embeddings_model.embed_query(context)
-    answer_emb = embeddings_model.embed_query(student_answer)
-
-    # Cosine similarity
-    similarity = np.dot(context_emb, answer_emb) / (np.linalg.norm(context_emb) * np.linalg.norm(answer_emb))
-    return float(similarity)
 
-
-
-
-
-
-
-
-#
-
-
-
-
 
-
-
 
-
-
-    # Penalty for contradictions: -50% per contradiction
 
-
 
-#
-
-
 
-#
-
-**Grading Analysis:**
 
-
-
 
-
-
 
-
-
 
-
-""
 
-
-
-# ---------------------------------------------------------
-# 3. LLM EVALUATOR (Fallback for edge cases)
-# ---------------------------------------------------------
-class LLMEvaluator:
-    def __init__(self):
-        self.repo_id = "onnx-community/Qwen2.5-0.5B-Instruct"
-        self.local_dir = "onnx_qwen_local"
 
-
-
-            repo_id=self.repo_id,
-            local_dir=self.local_dir,
-            allow_patterns=["config.json", "generation_config.json", "tokenizer*", "special_tokens_map.json", "*.jinja", "onnx/model_fp16.onnx*"]
-        )
-
-        self.tokenizer = AutoTokenizer.from_pretrained(self.local_dir)
 
-
-
 
-
-            self.local_dir,
-            subfolder="onnx",
-            file_name="model_fp16.onnx",
-            use_cache=True,
-            use_io_binding=False,
-            provider=PROVIDERS[0],
-            session_options=sess_options
-        )
-
-    def evaluate(self, context, question, student_answer, max_marks, rule_based_score):
-        """
-        Use LLM only for ambiguous cases or to verify edge cases.
-        Simplified prompt for 0.5B model.
-        """
-        # If rule-based gave clear 0 or max, don't bother with LLM
-        if rule_based_score == 0:
-            return "Score: 0/{max_marks}\nFeedback: Answer contains significant errors or contradictions with the reference text."
-        if rule_based_score == max_marks:
-            return "Score: {max_marks}/{max_marks}\nFeedback: Excellent answer that fully covers the reference material."
 
-
-
 
-
-
-
 
-
-
-
-
 
-
-
-
 
-
-
-
-
-
-
-
-
-
         )
 
-
-
-
-
-
-# ---------------------------------------------------------
-# 4. MAIN APPLICATION
-# ---------------------------------------------------------
-class VectorSystem:
-    def __init__(self):
-        self.vector_store = None
-        self.embeddings = OnnxBgeEmbeddings()
-        self.rule_grader = RuleBasedGrader()
-        self.llm = LLMEvaluator()
-        self.all_chunks = []
-        self.total_chunks = 0
-
-    def process_content(self, file_obj, raw_text):
-        has_file = file_obj is not None
-        has_text = raw_text is not None and len(raw_text.strip()) > 0
-
-        if has_file and has_text:
-            return "❌ Error: Provide EITHER file OR text, not both."
-
-        if not has_file and not has_text:
-            return "⚠️ No content provided."
-
-        try:
-            text = ""
-            if has_file:
-                if file_obj.name.endswith('.pdf'):
-                    doc = fitz.open(file_obj.name)
-                    for page in doc:
-                        text += page.get_text()
-                elif file_obj.name.endswith('.txt'):
-                    with open(file_obj.name, 'r', encoding='utf-8') as f:
-                        text = f.read()
-                else:
-                    return "❌ Only .pdf and .txt supported."
-            else:
-                text = raw_text
-
-            # Larger chunks for better context
-            text_splitter = RecursiveCharacterTextSplitter(
-                chunk_size=1000,
-                chunk_overlap=200,
-                separators=["\n\n", "\n", ". ", " ", ""]
             )
-            self.all_chunks = text_splitter.split_text(text)
-            self.total_chunks = len(self.all_chunks)
 
-
-
-
-            metadatas = [{"id": i} for i in range(self.total_chunks)]
-            self.vector_store = FAISS.from_texts(
-                self.all_chunks,
-                self.embeddings,
-                metadatas=metadatas
             )
-
-            return f"✅ Indexed {self.total_chunks} chunks."
-        except Exception as e:
-            return f"Error: {str(e)}"
-
-    def process_query(self, question, student_answer, max_marks):
-        if not self.vector_store:
-            return "⚠️ Upload content first.", ""
-        if not question:
-            return "⚠️ Enter a question.", ""
-        if not student_answer:
-            return "⚠️ Enter a student answer.", ""
-
-        # Retrieve relevant context
-        results = self.vector_store.similarity_search_with_score(question, k=2)
-
-        # Combine top 2 chunks for better context
-        context_parts = []
-        for doc, score in results:
-            context_parts.append(self.all_chunks[doc.metadata['id']])
-
-        expanded_context = "\n".join(context_parts)
-
-        # Use rule-based grading (fast, deterministic)
-        score, feedback = self.rule_grader.grade(
-            expanded_context,
-            question,
-            student_answer,
-            max_marks,
-            self.embeddings
-        )
-
-        # Optional: Use LLM for ambiguous cases (score between 20-80%)
-        # Uncomment if you want LLM verification
-        # if 0.2 < (score/max_marks) < 0.8:
-        #     llm_feedback = self.llm.evaluate(expanded_context, question, student_answer, max_marks, score)
-        #     feedback += f"\n\n**LLM Verification:**\n{llm_feedback}"
-
-        evidence_display = f"### 📄 Context Used:\n{expanded_context[:800]}..."
-        grade_display = f"### 📊 Grade: {score:.1f}/{max_marks}\n\n{feedback}"
-
-        return evidence_display, grade_display
-
-# Initialize and launch
-system = VectorSystem()
-
-with gr.Blocks(title="EduGenius AI Grader") as demo:
-    gr.Markdown("# ⚡ EduGenius: CPU Optimized RAG")
-    gr.Markdown("Hybrid Rule-Based + LLM Grading (ONNX Optimized)")
 
     with gr.Row():
-        with gr.Column(
-            gr.
-
-
-
-                label="Paste Context",
-                placeholder="Paste text here...",
-                lines=5
             )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        system.process_query,
-        inputs=[q_input, a_input, max_marks],
-        outputs=[evidence_box, grade_box]
     )
 import gradio as gr
+import PyPDF2
 import re
+import json
+from typing import List, Dict, Tuple
+from transformers import pipeline
+import tempfile
+import os
 
+# Initialize the question generation pipeline using a small CPU-friendly model
+print("Loading models... This may take a minute on first run.")
+qa_generator = pipeline(
+    "text2text-generation",
+    model="valhalla/t5-small-qg-hl",
+    tokenizer="valhalla/t5-small-qg-hl",
+    device=-1  # Force CPU
+)
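For reference, a minimal sketch of calling this pipeline directly with the `<hl>` highlight format the model expects (the same format `generate_qa_pairs` builds further down in this file); the sample sentences are made up:

    example = (
        "generate question: <hl> Paris is the capital of France. <hl> "
        "Paris is the capital of France. It is known for the Eiffel Tower."
    )
    out = qa_generator(example, max_length=128)
    print(out[0]["generated_text"])  # e.g. "What is the capital of France?"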
|
+def extract_text_from_pdf(pdf_file) -> str:
+    """Extract text from uploaded PDF file."""
+    text = ""
+    try:
+        # PyPDF2.PdfReader accepts both a file path and a file-like object
+        pdf_reader = PyPDF2.PdfReader(pdf_file)
+
+        for page in pdf_reader.pages:
+            # extract_text() can return None for image-only pages
+            text += (page.extract_text() or "") + "\n"
+    except Exception as e:
+        return f"Error reading PDF: {str(e)}"
+
+    return text
 
+def clean_text(text: str) -> str:
+    """Clean and preprocess extracted text."""
+    # Remove excessive whitespace
+    text = re.sub(r'\s+', ' ', text)
+    # Remove special characters but keep sentence structure
+    text = re.sub(r'[^\w\s.,;!?-]', '', text)
+    return text.strip()
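A quick illustration of what the two substitutions do, on a made-up input:

    print(clean_text("Hello,\n\n   world! [1] (see §2)"))
    # -> "Hello, world! 1 see 2"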
|
+def chunk_text(text: str, max_chunk_size: int = 512, overlap: int = 50) -> List[str]:
+    """Split text into overlapping chunks for processing."""
+    sentences = re.split(r'(?<=[.!?])\s+', text)
+    chunks = []
+    current_chunk = ""
 
+    for sentence in sentences:
+        if len(current_chunk) + len(sentence) < max_chunk_size:
+            current_chunk += " " + sentence
+        else:
+            if current_chunk:
+                chunks.append(current_chunk.strip())
+            current_chunk = sentence
 
+    if current_chunk:
+        chunks.append(current_chunk.strip())
 
+    # Add overlap between chunks for context
+    overlapped_chunks = []
+    for i, chunk in enumerate(chunks):
+        if i > 0 and overlap > 0:
+            prev_sentences = chunks[i-1].split('. ')
+            overlap_text = '. '.join(prev_sentences[-2:]) if len(prev_sentences) > 1 else chunks[i-1][-overlap:]
+            chunk = overlap_text + " " + chunk
+        overlapped_chunks.append(chunk)
 
+    return overlapped_chunks
+
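A small sanity check of the chunker (illustrative only; chunk counts and sizes vary with the input):

    sample = "This is sentence one. This is sentence two. This is sentence three. " * 10
    pieces = chunk_text(sample, max_chunk_size=200, overlap=30)
    print(len(pieces), [len(p) for p in pieces[:3]])  # a handful of ~200-char chunks, each overlapping the last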
+
def generate_qa_pairs(chunk: str, num_questions: int = 2) -> List[Dict[str, str]]:
|
| 73 |
+
"""Generate question-answer pairs from a text chunk."""
|
| 74 |
+
flashcards = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
+
# Skip chunks that are too short
|
| 77 |
+
if len(chunk.split()) < 20:
|
| 78 |
+
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
+
try:
|
| 81 |
+
# Generate highlight format for T5 question generation
|
| 82 |
+
# We'll create simple highlight by taking key sentences
|
| 83 |
+
sentences = chunk.split('. ')
|
| 84 |
+
if len(sentences) < 2:
|
| 85 |
+
return []
|
| 86 |
+
|
| 87 |
+
# Generate questions for different parts of the chunk
|
| 88 |
+
for i in range(min(num_questions, len(sentences))):
|
| 89 |
+
# Create highlight context
|
| 90 |
+
highlight = sentences[i]
|
| 91 |
+
context = chunk
|
| 92 |
+
|
| 93 |
+
# Format for T5: "generate question: <hl> highlight <hl> context"
|
| 94 |
+
input_text = f"generate question: <hl> {highlight} <hl> {context}"
|
| 95 |
+
|
| 96 |
+
# Generate question
|
| 97 |
+
outputs = qa_generator(
|
| 98 |
+
input_text,
|
| 99 |
+
max_length=128,
|
| 100 |
+
num_return_sequences=1,
|
| 101 |
+
do_sample=True,
|
| 102 |
+
temperature=0.7
|
| 103 |
+
)
|
| 104 |
+
|
| 105 |
+
question = outputs[0]['generated_text'].strip()
|
| 106 |
+
|
| 107 |
+
# Clean up question
|
| 108 |
+
question = re.sub(r'^(question:|q:)', '', question, flags=re.IGNORECASE).strip()
|
| 109 |
+
|
| 110 |
+
if question and len(question) > 10:
|
| 111 |
+
flashcards.append({
|
| 112 |
+
"question": question,
|
| 113 |
+
"answer": highlight.strip(),
|
| 114 |
+
"context": context[:200] + "..." if len(context) > 200 else context
|
| 115 |
+
})
|
| 116 |
+
|
| 117 |
+
except Exception as e:
|
| 118 |
+
print(f"Error generating QA: {e}")
|
| 119 |
+
|
| 120 |
+
return flashcards
|
| 121 |
+
|
| 122 |
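Calling the generator on a single chunk looks like this (made-up chunk; the exact question varies because do_sample=True):

    demo_chunk = (
        "The mitochondrion is the powerhouse of the cell. "
        "It generates most of the cell's supply of ATP. "
        "ATP is then used throughout the cell as a source of chemical energy."
    )
    cards = generate_qa_pairs(demo_chunk, num_questions=1)
    # e.g. [{"question": "What is the powerhouse of the cell?",
    #        "answer": "The mitochondrion is the powerhouse of the cell", ...}]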
+
def process_pdf(pdf_file, questions_per_chunk: int = 2, max_chunks: int = 20):
|
| 123 |
+
"""Main processing function."""
|
| 124 |
+
if pdf_file is None:
|
| 125 |
+
return "Please upload a PDF file.", None, None
|
| 126 |
+
|
| 127 |
+
try:
|
| 128 |
+
# Extract text
|
| 129 |
+
yield "π Extracting text from PDF...", None, None
|
| 130 |
+
raw_text = extract_text_from_pdf(pdf_file)
|
| 131 |
|
| 132 |
+
if raw_text.startswith("Error"):
|
| 133 |
+
return raw_text, None, None
|
| 134 |
|
| 135 |
+
if len(raw_text.strip()) < 100:
|
| 136 |
+
return "PDF appears to be empty or contains no extractable text.", None, None
|
|
|
|
| 137 |
|
| 138 |
+
# Clean text
|
| 139 |
+
yield "π§Ή Cleaning text...", None, None
|
| 140 |
+
cleaned_text = clean_text(raw_text)
|
| 141 |
|
| 142 |
+
# Chunk text
|
| 143 |
+
yield "βοΈ Chunking text into sections...", None, None
|
| 144 |
+
chunks = chunk_text(cleaned_text)
|
| 145 |
|
| 146 |
+
# Limit chunks for CPU performance
|
| 147 |
+
chunks = chunks[:max_chunks]
|
|
|
|
| 148 |
|
| 149 |
+
# Generate flashcards
|
| 150 |
+
all_flashcards = []
|
| 151 |
+
total_chunks = len(chunks)
|
| 152 |
|
| 153 |
+
for i, chunk in enumerate(chunks):
|
| 154 |
+
progress = f"π΄ Generating flashcards... ({i+1}/{total_chunks} chunks processed)"
|
| 155 |
+
yield progress, None, None
|
| 156 |
+
|
| 157 |
+
cards = generate_qa_pairs(chunk, questions_per_chunk)
|
| 158 |
+
all_flashcards.extend(cards)
|
| 159 |
|
| 160 |
+
if not all_flashcards:
|
| 161 |
+
return "Could not generate flashcards from this PDF. Try a PDF with more textual content.", None, None
|
| 162 |
|
| 163 |
+
# Format output
|
| 164 |
+
yield "β
Finalizing...", None, None
|
| 165 |
|
| 166 |
+
# Create formatted display
|
| 167 |
+
display_text = format_flashcards_display(all_flashcards)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
+
# Create JSON download
|
| 170 |
+
json_output = json.dumps(all_flashcards, indent=2, ensure_ascii=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
+
# Create Anki/CSV format
|
| 173 |
+
csv_lines = ["Question,Answer"]
|
| 174 |
+
for card in all_flashcards:
|
| 175 |
+
q = card['question'].replace('"', '""')
|
| 176 |
+
a = card['answer'].replace('"', '""')
|
| 177 |
+
csv_lines.append(f'"{q}","{a}"')
|
| 178 |
+
csv_output = "\n".join(csv_lines)
|
| 179 |
|
| 180 |
+
return display_text, csv_output, json_output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
|
| 182 |
+
except Exception as e:
|
| 183 |
+
return f"Error processing PDF: {str(e)}", None, None
|
| 184 |
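The doubled-quote escaping used for the CSV export above is the standard CSV convention; a worked example with a hypothetical card:

    card = {"question": 'What does "RAG" stand for?', "answer": "Retrieval-Augmented Generation"}
    q = card["question"].replace('"', '""')
    a = card["answer"].replace('"', '""')
    print(f'"{q}","{a}"')
    # "What does ""RAG"" stand for?","Retrieval-Augmented Generation"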
|
+def format_flashcards_display(flashcards: List[Dict]) -> str:
+    """Format flashcards for nice display."""
+    lines = [f"## 🎴 Generated {len(flashcards)} Flashcards\n"]
+
+    for i, card in enumerate(flashcards, 1):
+        lines.append(f"### Card {i}")
+        lines.append(f"**Q:** {card['question']}")
+        lines.append(f"**A:** {card['answer']}")
+        lines.append(f"*Context: {card['context'][:100]}...*\n")
+        lines.append("---\n")
+
+    return "\n".join(lines)
 
+def create_sample_flashcard():
+    """Create a sample flashcard for demo purposes."""
+    sample = [{
+        "question": "What is the capital of France?",
+        "answer": "Paris is the capital and most populous city of France.",
+        "context": "Paris is the capital and most populous city of France..."
+    }]
+    return format_flashcards_display(sample)
 
+# Custom CSS for better styling
+custom_css = """
+.flashcard-container {
+    border: 2px solid #e0e0e0;
+    border-radius: 10px;
+    padding: 20px;
+    margin: 10px 0;
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    color: white;
+}
+.question {
+    font-size: 1.2em;
+    font-weight: bold;
+    margin-bottom: 10px;
+}
+.answer {
+    font-size: 1em;
+    opacity: 0.9;
+}
+"""
 
+# Gradio Interface
+with gr.Blocks(css=custom_css, title="PDF to Flashcards") as demo:
+    gr.Markdown("""
+    # 📚 PDF to Flashcards Generator
+
+    Upload any PDF document and automatically generate study flashcards (Q&A pairs) using AI.
+
+    **Features:**
+    - 🧠 Uses local CPU-friendly AI (no GPU needed)
+    - 📄 Extracts text from any PDF
+    - ✂️ Intelligently chunks content
+    - 🎴 Generates question-answer pairs
+    - 💾 Export to CSV (Anki-compatible) or JSON
+
+    *Note: Processing is done entirely on CPU, so large PDFs may take a few minutes.*
+    """)
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            pdf_input = gr.File(
+                label="Upload PDF",
+                file_types=[".pdf"],
+                type="filepath"
             )
+
+            with gr.Row():
+                questions_per_chunk = gr.Slider(
+                    minimum=1,
+                    maximum=5,
+                    value=2,
+                    step=1,
+                    label="Questions per section"
+                )
+                max_chunks = gr.Slider(
+                    minimum=5,
+                    maximum=50,
+                    value=20,
+                    step=5,
+                    label="Max sections to process"
+                )
+
+            process_btn = gr.Button("🚀 Generate Flashcards", variant="primary")
+
+            gr.Markdown("""
+            ### 💡 Tips:
+            - Text-based PDFs work best (scanned images won't work)
+            - Academic papers and articles work great
+            - Adjust "Questions per section" based on content density
+            """)
 
+        with gr.Column(scale=2):
+            status_text = gr.Textbox(
+                label="Status",
+                value="Ready to process PDF...",
+                interactive=False
             )
 
+            output_display = gr.Markdown(
+                label="Generated Flashcards",
+                value="Your flashcards will appear here..."
             )
 
     with gr.Row():
+        with gr.Column():
+            csv_output = gr.Textbox(
+                label="CSV Format (for Anki import)",
+                lines=10,
+                visible=True
             )
+            gr.Markdown("*Copy the CSV content and save as `.csv` file to import into Anki*")
+
+        with gr.Column():
+            json_output = gr.Textbox(
+                label="JSON Format",
+                lines=10,
+                visible=True
+            )
+            gr.Markdown("*Raw JSON data for custom applications*")
+
+
# Event handlers
|
| 308 |
+
process_btn.click(
|
| 309 |
+
fn=process_pdf,
|
| 310 |
+
inputs=[pdf_input, questions_per_chunk, max_chunks],
|
| 311 |
+
outputs=[status_text, csv_output, json_output]
|
| 312 |
+
).then(
|
| 313 |
+
fn=lambda x: x if not isinstance(x, str) or not x.startswith("π") else gr.update(),
|
| 314 |
+
inputs=status_text,
|
| 315 |
+
outputs=output_display
|
|
|
|
|
|
|
|
|
|
| 316 |
)
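Because `process_pdf` is a generator, `click` streams each yielded (status, csv, json) tuple into the three outputs as it arrives. The same sequence can be observed outside Gradio (hypothetical file name):

    for status, csv_text, json_text in process_pdf("notes.pdf"):
        print(status)
    # 📄 Extracting text from PDF...
    # 🧹 Cleaning text...
    # ✂️ Chunking text into sections...
    # 🎴 Generating flashcards... (1/N chunks processed)
    # ...the final tuple carries the flashcard markdown plus the CSV and JSON payloads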
+
+    # Example section
+    gr.Markdown("---")
+    gr.Markdown("### 🎯 Example Output Format")
+    gr.Markdown(create_sample_flashcard())
 
 if __name__ == "__main__":
     demo.launch()