"""PDF to Flashcards Generator (Enhanced).

Upload a PDF and automatically generate study flashcards — What / Why / How
question-answer pairs — using a local, CPU-only T5 question-generation model
(``valhalla/t5-small-qg-hl``). Results can be exported as Anki-compatible CSV
or raw JSON.

NOTE(review): a fully commented-out previous revision of this script used to
sit at the top of this file (with broken comment prefixes that made the file
unparsable). It has been removed; recover it from version control if needed.
"""

import gradio as gr
import PyPDF2
import re
import json
from typing import List, Dict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import tempfile  # noqa: F401  -- kept: present in the original import block
import os        # noqa: F401  -- kept: present in the original import block

# Initialize the model and tokenizer at import time so the first user request
# does not pay the (slow) model-loading cost.
print("Loading models... This may take a minute on first run.")
model_name = "valhalla/t5-small-qg-hl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Evaluation mode, pinned to CPU — the app is advertised as GPU-free.
model.eval()
device = torch.device("cpu")
model.to(device)


def extract_key_phrases(text: str) -> List[str]:
    """Extract up to 5 potential answer candidates from *text*.

    Candidates come from two heuristics: capitalized word runs (likely named
    entities) and a few regex patterns that capture definitional phrases
    ("the process of X", "known as X", "is X that ...").
    """
    candidates = []

    # Capitalized words/phrases (potential named entities); keep the first 3.
    capitalized = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
    candidates.extend(capitalized[:3])

    # Definitional / conceptual phrasings; keep at most 2 matches per pattern.
    concept_patterns = [
        r'(?:process|method|technique|approach|concept|theory|principle|system) of ([^,.]{10,50})',
        r'(?:known as|called|termed|referred to as) ([^,.]{5,40})',
        r'(?:is|are|was|were) (\w+(?:\s+\w+){1,4}) (?:that|which|who)',
    ]
    for pattern in concept_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        candidates.extend(matches[:2])

    # Clean, drop very short candidates, deduplicate preserving order.
    candidates = [c.strip() for c in candidates if len(c.strip()) > 5]
    return list(dict.fromkeys(candidates))[:5]


def generate_questions(context: str, answer: str, question_type: str = "what",
                       max_length: int = 128) -> str:
    """Generate one question for *answer* within *context* using the T5 model.

    *question_type* ("what"/"why"/"how") tweaks decoding parameters and drives
    post-processing in :func:`improve_question`. Returns "" on failure or when
    the generated question is too short (<= 10 chars).
    """
    try:
        # Model input format expected by the qg checkpoint:
        # "generate question: <answer> <context>"
        input_text = f"generate question: {answer} {context}"

        inputs = tokenizer(
            input_text,
            return_tensors="pt",
            max_length=512,
            truncation=True,
            padding=True,
        ).to(device)

        # Hotter sampling / wider beams for why/how questions to encourage
        # more varied phrasings than plain factual "what" questions.
        temperature = 0.7 if question_type == "what" else 0.85
        num_beams = 4 if question_type == "what" else 5

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=True,
                do_sample=True,
                temperature=temperature,
            )

        question = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Strip any leading "question:"/"q:" tag the model may emit.
        question = re.sub(r'^(question:|q:)', '', question, flags=re.IGNORECASE).strip()

        # Post-process to improve quality / coerce towards the requested type.
        question = improve_question(question, answer, context, question_type)
        return question if len(question) > 10 else ""
    except Exception as e:
        # Best-effort: a single failed generation should not abort the run.
        print(f"Error generating question: {e}")
        return ""


def improve_question(question: str, answer: str, context: str, question_type: str) -> str:
    """Post-process a generated question: punctuation, capitalization, and
    optional transformation towards a "why" or "how" phrasing."""
    # Ensure the question ends with a question mark.
    if not question.endswith('?'):
        question = question.rstrip('.') + '?'

    # Capitalize the first letter.
    question = question[0].upper() + question[1:] if question else question

    # Try to transform to why/how questions if requested and not already so.
    if question_type == "why" and not question.lower().startswith("why"):
        if re.search(r'\b(is|are|was|were|does|do|did)\b', question, re.IGNORECASE):
            question = create_why_question(question, answer, context)
    elif question_type == "how" and not question.lower().startswith("how"):
        if re.search(r'\b(does|do|did|can|could)\b', question, re.IGNORECASE):
            question = create_how_question(question, answer, context)

    return question


def create_why_question(base_question: str, answer: str, context: str) -> str:
    """Transform or create a 'why' question; falls back to *base_question*."""
    # Look for causal indicators in the context.
    causal_patterns = [
        r'because ([^,.]{10,60})',
        r'due to ([^,.]{10,60})',
        r'as a result of ([^,.]{10,60})',
        r'(?:leads to|causes|results in) ([^,.]{10,60})',
        r'in order to ([^,.]{10,60})',
    ]
    for pattern in causal_patterns:
        match = re.search(pattern, context, re.IGNORECASE)
        if match:
            # A causal phrase exists; anchor the question on the first
            # capitalized subject found before a verb in the context.
            subject_match = re.search(
                r'([A-Z][a-z]+(?:\s+[a-z]+){0,3})\s+(?:is|are|was|were|does|do)',
                context)
            if subject_match:
                subject = subject_match.group(1)
                return f"Why does {subject.lower()} occur?"

    # Fallback: generic why question built from the first words of the answer.
    words = answer.split()
    if len(words) > 3:
        return f"Why is {' '.join(words[:4])}... important?"
    return base_question


def create_how_question(base_question: str, answer: str, context: str) -> str:
    """Transform or create a 'how' question; falls back to *base_question*."""
    # Look for process indicators. The first pattern captures two groups
    # (kind-of-process, subject); the other two capture one.
    process_patterns = [
        r'(process|method|procedure|technique|approach) (?:of|for|to) ([^,.]{10,60})',
        r'by ([^,.]{10,60})',
        r'through ([^,.]{10,60})',
    ]
    for pattern in process_patterns:
        match = re.search(pattern, context, re.IGNORECASE)
        if match:
            if len(match.groups()) > 1:
                process = match.group(2)
                return f"How does {process.lower()} work?"
            else:
                process = match.group(1)
                return f"How is {process.lower()} achieved?"

    # Fallback: look for an action verb and its capitalized subject.
    verbs = re.findall(r'\b(works?|functions?|operates?|performs?|executes?)\b',
                       context, re.IGNORECASE)
    if verbs:
        subject_match = re.search(
            r'([A-Z][a-z]+(?:\s+[a-z]+){0,3})\s+' + verbs[0],
            context, re.IGNORECASE)
        if subject_match:
            subject = subject_match.group(1)
            return f"How does {subject.lower()} {verbs[0].lower()}?"
    return base_question


def extract_text_from_pdf(pdf_file) -> str:
    """Extract all page text from *pdf_file* (path string or file-like object).

    Returns the concatenated text, or an "Error reading PDF: ..." string on
    failure (callers check for the "Error" prefix).
    """
    text = ""
    try:
        # PyPDF2.PdfReader accepts both a filesystem path and a binary
        # file-like object, so no branching on the argument type is needed.
        # (The original code had two identical isinstance branches.)
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    except Exception as e:
        return f"Error reading PDF: {str(e)}"
    return text


def clean_text(text: str) -> str:
    """Clean and preprocess extracted text."""
    # Collapse all whitespace runs to single spaces.
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters but keep sentence structure.
    text = re.sub(r'[^\w\s.,;!?-]', '', text)
    return text.strip()


def chunk_text(text: str, max_chunk_size: int = 512, overlap: int = 50) -> List[str]:
    """Split *text* into sentence-aligned chunks of at most ~max_chunk_size
    characters, then prepend a little trailing context from the previous
    chunk (for all chunks after the first) so questions keep their context."""
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) < max_chunk_size:
            current_chunk += " " + sentence
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk.strip())

    # Add overlap between chunks for context: the last two sentences of the
    # previous chunk, or its last *overlap* characters if it has <2 sentences.
    overlapped_chunks = []
    for i, chunk in enumerate(chunks):
        if i > 0 and overlap > 0:
            prev_sentences = chunks[i - 1].split('. ')
            overlap_text = ('. '.join(prev_sentences[-2:])
                            if len(prev_sentences) > 1
                            else chunks[i - 1][-overlap:])
            chunk = overlap_text + " " + chunk
        overlapped_chunks.append(chunk)
    return overlapped_chunks


def generate_qa_pairs(chunk: str, num_questions: int = 3) -> List[Dict[str, str]]:
    """Generate up to *num_questions* varied Q/A flashcards from one chunk.

    Answers are drawn from extracted key phrases plus the chunk's first two
    long sentences; question types cycle through what/why/how for variety.
    Each card dict has keys: "question", "answer", "context", "type".
    """
    flashcards = []

    # Skip chunks that are too short to yield meaningful questions.
    words = chunk.split()
    if len(words) < 20:
        return []

    try:
        # Candidate answers: key phrases first, then up to two long sentences.
        key_phrases = extract_key_phrases(chunk)
        sentences = [s.strip() for s in chunk.split('. ') if len(s.strip()) > 20]
        answer_candidates = key_phrases + sentences[:2]
        if len(answer_candidates) < 1:
            return []

        # Cycle through question types for diversity.
        question_types = ["what", "why", "how"]

        questions_generated = 0
        for i, answer in enumerate(answer_candidates):
            if questions_generated >= num_questions:
                break
            # Skip very short answers.
            if len(answer.split()) < 3:
                continue
            q_type = question_types[i % len(question_types)]
            question = generate_questions(chunk, answer, question_type=q_type)
            if question and question != answer:  # make sure they're different
                flashcards.append({
                    "question": question,
                    "answer": answer,
                    "context": chunk[:200] + "..." if len(chunk) > 200 else chunk,
                    "type": q_type,
                })
                questions_generated += 1
    except Exception as e:
        print(f"Error generating QA: {e}")

    return flashcards


def process_pdf(pdf_file, questions_per_chunk: int = 3, max_chunks: int = 20):
    """Main processing generator bound to the Gradio button.

    Yields (status, csv_output, json_output, display_markdown) tuples so the
    UI updates progressively while the PDF is processed.
    """
    if pdf_file is None:
        # BUG FIX: this function is a generator (it uses `yield`), so the
        # original `return <tuple>` here became the StopIteration value and
        # was never delivered to the outputs — the message was silently lost.
        yield "Please upload a PDF file.", "", "", "Your flashcards will appear here..."
        return

    try:
        # Extract text.
        yield "📄 Extracting text from PDF...", "", "", "Processing..."
        raw_text = extract_text_from_pdf(pdf_file)
        if raw_text.startswith("Error"):
            yield raw_text, "", "", "Error occurred"
            return
        if len(raw_text.strip()) < 100:
            yield ("PDF appears to be empty or contains no extractable text.",
                   "", "", "Error occurred")
            return

        # Clean text.
        yield "🧹 Cleaning text...", "", "", "Processing..."
        cleaned_text = clean_text(raw_text)

        # Chunk text.
        yield "✂️ Chunking text into sections...", "", "", "Processing..."
        chunks = chunk_text(cleaned_text)

        # Limit chunks for CPU performance.
        chunks = chunks[:max_chunks]

        # Generate flashcards, reporting progress per chunk.
        all_flashcards = []
        total_chunks = len(chunks)
        for i, chunk in enumerate(chunks):
            progress = f"🎴 Generating flashcards... ({i+1}/{total_chunks} chunks processed)"
            yield progress, "", "", "Processing..."
            cards = generate_qa_pairs(chunk, questions_per_chunk)
            all_flashcards.extend(cards)

        if not all_flashcards:
            yield ("Could not generate flashcards from this PDF. "
                   "Try a PDF with more textual content.",
                   "", "", "No flashcards generated")
            return

        # Format output.
        yield "✅ Finalizing...", "", "", "Almost done..."

        # Create formatted display.
        display_text = format_flashcards_display(all_flashcards)

        # Create JSON download.
        json_output = json.dumps(all_flashcards, indent=2, ensure_ascii=False)

        # Create Anki/CSV format (quotes doubled per RFC 4180).
        csv_lines = ["Question,Answer,Type"]
        for card in all_flashcards:
            q = card['question'].replace('"', '""')
            a = card['answer'].replace('"', '""')
            t = card.get('type', 'what')
            csv_lines.append(f'"{q}","{a}","{t}"')
        csv_output = "\n".join(csv_lines)

        # FINAL OUTPUT — this updates all components, with a per-type summary.
        stats = f"✅ Done! Generated {len(all_flashcards)} flashcards ("
        types_count = {}
        for card in all_flashcards:
            t = card.get('type', 'what')
            types_count[t] = types_count.get(t, 0) + 1
        stats += ", ".join([f"{count} {qtype}" for qtype, count in types_count.items()]) + ")"
        yield stats, csv_output, json_output, display_text

    except Exception as e:
        error_msg = f"Error processing PDF: {str(e)}"
        print(error_msg)
        yield error_msg, "", "", error_msg


def format_flashcards_display(flashcards: List[Dict]) -> str:
    """Format flashcards as Markdown for display, with a per-type breakdown."""
    lines = [f"## 🎴 Generated {len(flashcards)} Flashcards\n"]

    # Count by type.
    types_count = {}
    for card in flashcards:
        t = card.get('type', 'what')
        types_count[t] = types_count.get(t, 0) + 1
    lines.append(
        f"**Breakdown:** "
        f"{', '.join([f'{count} {qtype.upper()}' for qtype, count in types_count.items()])}\n")
    lines.append("---\n")

    for i, card in enumerate(flashcards, 1):
        qtype = card.get('type', 'what').upper()
        emoji = "❓" if qtype == "WHAT" else "🤔" if qtype == "WHY" else "🔧"
        lines.append(f"### {emoji} Card {i} - {qtype}")
        lines.append(f"**Q:** {card['question']}")
        lines.append(f"**A:** {card['answer']}")
        lines.append(f"*Context: {card['context'][:100]}...*\n")
        lines.append("---\n")
    return "\n".join(lines)


def create_sample_flashcard():
    """Create sample flashcards (one per question type) for the demo section."""
    sample = [
        {
            "question": "What is photosynthesis?",
            "answer": "Photosynthesis is the process by which plants convert sunlight into energy.",
            "context": "Photosynthesis is the process by which plants convert sunlight into energy...",
            "type": "what",
        },
        {
            "question": "Why do plants need chlorophyll?",
            "answer": "Chlorophyll absorbs light energy needed for photosynthesis.",
            "context": "Chlorophyll absorbs light energy needed for photosynthesis...",
            "type": "why",
        },
        {
            "question": "How do plants convert light into chemical energy?",
            "answer": "Through the process of photosynthesis in the chloroplasts.",
            "context": "Through the process of photosynthesis in the chloroplasts...",
            "type": "how",
        },
    ]
    return format_flashcards_display(sample)


# Custom CSS for better styling of the flashcard display.
custom_css = """
.flashcard-container {
    border: 2px solid #e0e0e0;
    border-radius: 10px;
    padding: 20px;
    margin: 10px 0;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
}
.question {
    font-size: 1.2em;
    font-weight: bold;
    margin-bottom: 10px;
}
.answer {
    font-size: 1em;
    opacity: 0.9;
}
"""

# Gradio Interface.
with gr.Blocks(css=custom_css, title="PDF to Flashcards") as demo:
    gr.Markdown("""
    # 📚 PDF to Flashcards Generator (Enhanced)

    Upload any PDF document and automatically generate study flashcards with
    **What, Why, and How** questions using AI.

    **✨ New Features:**
    - 🎯 Generates **What** questions (factual)
    - 🤔 Generates **Why** questions (reasoning)
    - 🔧 Generates **How** questions (process)
    - 📊 Improved question quality and variety
    - 🧠 Better answer extraction

    **Core Features:**
    - 🧠 Uses local CPU-friendly AI (no GPU needed)
    - 📄 Extracts text from any PDF
    - ✂️ Intelligently chunks content
    - 🎴 Generates diverse question-answer pairs
    - 💾 Export to CSV (Anki-compatible) or JSON

    *Note: Processing is done entirely on CPU, so large PDFs may take a few minutes.*
    """)

    with gr.Row():
        with gr.Column(scale=1):
            pdf_input = gr.File(
                label="Upload PDF",
                file_types=[".pdf"],
                type="filepath"
            )
            with gr.Row():
                questions_per_chunk = gr.Slider(
                    minimum=1,
                    maximum=6,
                    value=3,
                    step=1,
                    label="Questions per section"
                )
                max_chunks = gr.Slider(
                    minimum=5,
                    maximum=50,
                    value=20,
                    step=5,
                    label="Max sections to process"
                )
            process_btn = gr.Button("🚀 Generate Flashcards", variant="primary")
            gr.Markdown("""
            ### 💡 Tips:
            - Text-based PDFs work best (scanned images won't work)
            - Academic papers and articles work great
            - Adjust "Questions per section" for more variety
            - Higher questions per section = more Why/How questions
            """)

        with gr.Column(scale=2):
            status_text = gr.Textbox(
                label="Status",
                value="Ready to process PDF...",
                interactive=False
            )
            output_display = gr.Markdown(
                label="Generated Flashcards",
                value="Your flashcards will appear here..."
            )

    with gr.Row():
        with gr.Column():
            csv_output = gr.Textbox(
                label="CSV Format (for Anki import)",
                lines=10,
                visible=True
            )
            gr.Markdown("*Copy the CSV content and save as `.csv` file to import into Anki*")
        with gr.Column():
            json_output = gr.Textbox(
                label="JSON Format",
                lines=10,
                visible=True
            )
            gr.Markdown("*Raw JSON data for custom applications*")

    # Direct binding: process_pdf is a generator, so Gradio streams each
    # yielded tuple into the four output components in order.
    process_btn.click(
        fn=process_pdf,
        inputs=[pdf_input, questions_per_chunk, max_chunks],
        outputs=[status_text, csv_output, json_output, output_display]
    )

    # Example section.
    gr.Markdown("---")
    gr.Markdown("### 🎯 Example Output Format")
    gr.Markdown(create_sample_flashcard())

if __name__ == "__main__":
    demo.launch()