heerjtdev committed on
Commit
d818498
·
verified ·
1 Parent(s): 3eeedea

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +575 -31
app.py CHANGED
@@ -1,3 +1,374 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import PyPDF2
3
  import re
@@ -20,8 +391,33 @@ model.eval()
20
  device = torch.device("cpu")
21
  model.to(device)
22
 
23
- def generate_questions(context: str, answer: str, max_length: int = 128) -> str:
24
- """Generate a question using T5 model."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  try:
26
  # Format: "generate question: <hl> answer <hl> context"
27
  input_text = f"generate question: <hl> {answer} <hl> {context}"
@@ -35,15 +431,19 @@ def generate_questions(context: str, answer: str, max_length: int = 128) -> str:
35
  padding=True
36
  ).to(device)
37
 
 
 
 
 
38
  # Generate
39
  with torch.no_grad():
40
  outputs = model.generate(
41
  **inputs,
42
  max_length=max_length,
43
- num_beams=4,
44
  early_stopping=True,
45
  do_sample=True,
46
- temperature=0.7
47
  )
48
 
49
  # Decode
@@ -52,12 +452,97 @@ def generate_questions(context: str, answer: str, max_length: int = 128) -> str:
52
  # Clean up
53
  question = re.sub(r'^(question:|q:)', '', question, flags=re.IGNORECASE).strip()
54
 
 
 
 
55
  return question if len(question) > 10 else ""
56
 
57
  except Exception as e:
58
  print(f"Error generating question: {e}")
59
  return ""
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  def extract_text_from_pdf(pdf_file) -> str:
62
  """Extract text from uploaded PDF file."""
63
  text = ""
@@ -112,8 +597,8 @@ def chunk_text(text: str, max_chunk_size: int = 512, overlap: int = 50) -> List[
112
 
113
  return overlapped_chunks
114
 
115
- def generate_qa_pairs(chunk: str, num_questions: int = 2) -> List[Dict[str, str]]:
116
- """Generate question-answer pairs from a text chunk."""
117
  flashcards = []
118
 
119
  # Skip chunks that are too short
@@ -122,35 +607,51 @@ def generate_qa_pairs(chunk: str, num_questions: int = 2) -> List[Dict[str, str]
122
  return []
123
 
124
  try:
125
- # Split into sentences to use as answers
 
 
 
126
  sentences = [s.strip() for s in chunk.split('. ') if len(s.strip()) > 20]
127
 
128
- if len(sentences) < 1:
 
 
 
129
  return []
130
 
131
- # Generate questions for different sentences
132
- for i in range(min(num_questions, len(sentences))):
133
- answer = sentences[i]
 
 
 
 
 
134
 
135
  # Skip very short answers
136
  if len(answer.split()) < 3:
137
  continue
138
 
139
- question = generate_questions(chunk, answer)
 
 
 
140
 
141
  if question and question != answer: # Make sure they're different
142
  flashcards.append({
143
  "question": question,
144
  "answer": answer,
145
- "context": chunk[:200] + "..." if len(chunk) > 200 else chunk
 
146
  })
 
147
 
148
  except Exception as e:
149
  print(f"Error generating QA: {e}")
150
 
151
  return flashcards
152
 
153
- def process_pdf(pdf_file, questions_per_chunk: int = 2, max_chunks: int = 20):
154
  """Main processing function."""
155
  if pdf_file is None:
156
  return "Please upload a PDF file.", "", "", "Your flashcards will appear here..."
@@ -204,15 +705,23 @@ def process_pdf(pdf_file, questions_per_chunk: int = 2, max_chunks: int = 20):
204
  json_output = json.dumps(all_flashcards, indent=2, ensure_ascii=False)
205
 
206
  # Create Anki/CSV format
207
- csv_lines = ["Question,Answer"]
208
  for card in all_flashcards:
209
  q = card['question'].replace('"', '""')
210
  a = card['answer'].replace('"', '""')
211
- csv_lines.append(f'"{q}","{a}"')
 
212
  csv_output = "\n".join(csv_lines)
213
 
214
  # FINAL OUTPUT - this updates all components
215
- yield "βœ… Done! Generated {} flashcards".format(len(all_flashcards)), csv_output, json_output, display_text
 
 
 
 
 
 
 
216
 
217
  except Exception as e:
218
  error_msg = f"Error processing PDF: {str(e)}"
@@ -223,8 +732,20 @@ def format_flashcards_display(flashcards: List[Dict]) -> str:
223
  """Format flashcards for nice display."""
224
  lines = [f"## 🎴 Generated {len(flashcards)} Flashcards\n"]
225
 
 
 
 
 
 
 
 
 
 
226
  for i, card in enumerate(flashcards, 1):
227
- lines.append(f"### Card {i}")
 
 
 
228
  lines.append(f"**Q:** {card['question']}")
229
  lines.append(f"**A:** {card['answer']}")
230
  lines.append(f"*Context: {card['context'][:100]}...*\n")
@@ -234,11 +755,26 @@ def format_flashcards_display(flashcards: List[Dict]) -> str:
234
 
235
  def create_sample_flashcard():
236
  """Create a sample flashcard for demo purposes."""
237
- sample = [{
238
- "question": "What is the capital of France?",
239
- "answer": "Paris is the capital and most populous city of France.",
240
- "context": "Paris is the capital and most populous city of France..."
241
- }]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  return format_flashcards_display(sample)
243
 
244
  # Custom CSS for better styling
@@ -265,15 +801,22 @@ custom_css = """
265
  # Gradio Interface
266
  with gr.Blocks(css=custom_css, title="PDF to Flashcards") as demo:
267
  gr.Markdown("""
268
- # πŸ“š PDF to Flashcards Generator
 
 
269
 
270
- Upload any PDF document and automatically generate study flashcards (Q&A pairs) using AI.
 
 
 
 
 
271
 
272
- **Features:**
273
  - 🧠 Uses local CPU-friendly AI (no GPU needed)
274
  - πŸ“„ Extracts text from any PDF
275
  - βœ‚οΈ Intelligently chunks content
276
- - 🎴 Generates question-answer pairs
277
  - πŸ’Ύ Export to CSV (Anki-compatible) or JSON
278
 
279
  *Note: Processing is done entirely on CPU, so large PDFs may take a few minutes.*
@@ -290,8 +833,8 @@ with gr.Blocks(css=custom_css, title="PDF to Flashcards") as demo:
290
  with gr.Row():
291
  questions_per_chunk = gr.Slider(
292
  minimum=1,
293
- maximum=5,
294
- value=2,
295
  step=1,
296
  label="Questions per section"
297
  )
@@ -309,7 +852,8 @@ with gr.Blocks(css=custom_css, title="PDF to Flashcards") as demo:
309
  ### πŸ’‘ Tips:
310
  - Text-based PDFs work best (scanned images won't work)
311
  - Academic papers and articles work great
312
- - Adjust "Questions per section" based on content density
 
313
  """)
314
 
315
  with gr.Column(scale=2):
@@ -341,7 +885,7 @@ with gr.Blocks(css=custom_css, title="PDF to Flashcards") as demo:
341
  )
342
  gr.Markdown("*Raw JSON data for custom applications*")
343
 
344
- # FIXED: Direct binding without the broken .then() chain
345
  process_btn.click(
346
  fn=process_pdf,
347
  inputs=[pdf_input, questions_per_chunk, max_chunks],
 
1
+ # import gradio as gr
2
+ # import PyPDF2
3
+ # import re
4
+ # import json
5
+ # from typing import List, Dict
6
+ # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
7
+ # import torch
8
+ # import tempfile
9
+ # import os
10
+
11
+ # # Initialize the model and tokenizer directly
12
+ # print("Loading models... This may take a minute on first run.")
13
+
14
+ # model_name = "valhalla/t5-small-qg-hl"
15
+ # tokenizer = AutoTokenizer.from_pretrained(model_name)
16
+ # model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
17
+
18
+ # # Set to evaluation mode and CPU
19
+ # model.eval()
20
+ # device = torch.device("cpu")
21
+ # model.to(device)
22
+
23
+ # def generate_questions(context: str, answer: str, max_length: int = 128) -> str:
24
+ # """Generate a question using T5 model."""
25
+ # try:
26
+ # # Format: "generate question: <hl> answer <hl> context"
27
+ # input_text = f"generate question: <hl> {answer} <hl> {context}"
28
+
29
+ # # Tokenize
30
+ # inputs = tokenizer(
31
+ # input_text,
32
+ # return_tensors="pt",
33
+ # max_length=512,
34
+ # truncation=True,
35
+ # padding=True
36
+ # ).to(device)
37
+
38
+ # # Generate
39
+ # with torch.no_grad():
40
+ # outputs = model.generate(
41
+ # **inputs,
42
+ # max_length=max_length,
43
+ # num_beams=4,
44
+ # early_stopping=True,
45
+ # do_sample=True,
46
+ # temperature=0.7
47
+ # )
48
+
49
+ # # Decode
50
+ # question = tokenizer.decode(outputs[0], skip_special_tokens=True)
51
+
52
+ # # Clean up
53
+ # question = re.sub(r'^(question:|q:)', '', question, flags=re.IGNORECASE).strip()
54
+
55
+ # return question if len(question) > 10 else ""
56
+
57
+ # except Exception as e:
58
+ # print(f"Error generating question: {e}")
59
+ # return ""
60
+
61
+ # def extract_text_from_pdf(pdf_file) -> str:
62
+ # """Extract text from uploaded PDF file."""
63
+ # text = ""
64
+ # try:
65
+ # if isinstance(pdf_file, str):
66
+ # pdf_reader = PyPDF2.PdfReader(pdf_file)
67
+ # else:
68
+ # pdf_reader = PyPDF2.PdfReader(pdf_file)
69
+
70
+ # for page in pdf_reader.pages:
71
+ # page_text = page.extract_text()
72
+ # if page_text:
73
+ # text += page_text + "\n"
74
+ # except Exception as e:
75
+ # return f"Error reading PDF: {str(e)}"
76
+
77
+ # return text
78
+
79
+ # def clean_text(text: str) -> str:
80
+ # """Clean and preprocess extracted text."""
81
+ # # Remove excessive whitespace
82
+ # text = re.sub(r'\s+', ' ', text)
83
+ # # Remove special characters but keep sentence structure
84
+ # text = re.sub(r'[^\w\s.,;!?-]', '', text)
85
+ # return text.strip()
86
+
87
+ # def chunk_text(text: str, max_chunk_size: int = 512, overlap: int = 50) -> List[str]:
88
+ # """Split text into overlapping chunks for processing."""
89
+ # sentences = re.split(r'(?<=[.!?])\s+', text)
90
+ # chunks = []
91
+ # current_chunk = ""
92
+
93
+ # for sentence in sentences:
94
+ # if len(current_chunk) + len(sentence) < max_chunk_size:
95
+ # current_chunk += " " + sentence
96
+ # else:
97
+ # if current_chunk:
98
+ # chunks.append(current_chunk.strip())
99
+ # current_chunk = sentence
100
+
101
+ # if current_chunk:
102
+ # chunks.append(current_chunk.strip())
103
+
104
+ # # Add overlap between chunks for context
105
+ # overlapped_chunks = []
106
+ # for i, chunk in enumerate(chunks):
107
+ # if i > 0 and overlap > 0:
108
+ # prev_sentences = chunks[i-1].split('. ')
109
+ # overlap_text = '. '.join(prev_sentences[-2:]) if len(prev_sentences) > 1 else chunks[i-1][-overlap:]
110
+ # chunk = overlap_text + " " + chunk
111
+ # overlapped_chunks.append(chunk)
112
+
113
+ # return overlapped_chunks
114
+
115
+ # def generate_qa_pairs(chunk: str, num_questions: int = 2) -> List[Dict[str, str]]:
116
+ # """Generate question-answer pairs from a text chunk."""
117
+ # flashcards = []
118
+
119
+ # # Skip chunks that are too short
120
+ # words = chunk.split()
121
+ # if len(words) < 20:
122
+ # return []
123
+
124
+ # try:
125
+ # # Split into sentences to use as answers
126
+ # sentences = [s.strip() for s in chunk.split('. ') if len(s.strip()) > 20]
127
+
128
+ # if len(sentences) < 1:
129
+ # return []
130
+
131
+ # # Generate questions for different sentences
132
+ # for i in range(min(num_questions, len(sentences))):
133
+ # answer = sentences[i]
134
+
135
+ # # Skip very short answers
136
+ # if len(answer.split()) < 3:
137
+ # continue
138
+
139
+ # question = generate_questions(chunk, answer)
140
+
141
+ # if question and question != answer: # Make sure they're different
142
+ # flashcards.append({
143
+ # "question": question,
144
+ # "answer": answer,
145
+ # "context": chunk[:200] + "..." if len(chunk) > 200 else chunk
146
+ # })
147
+
148
+ # except Exception as e:
149
+ # print(f"Error generating QA: {e}")
150
+
151
+ # return flashcards
152
+
153
+ # def process_pdf(pdf_file, questions_per_chunk: int = 2, max_chunks: int = 20):
154
+ # """Main processing function."""
155
+ # if pdf_file is None:
156
+ # return "Please upload a PDF file.", "", "", "Your flashcards will appear here..."
157
+
158
+ # try:
159
+ # # Extract text
160
+ # yield "πŸ“„ Extracting text from PDF...", "", "", "Processing..."
161
+ # raw_text = extract_text_from_pdf(pdf_file)
162
+
163
+ # if raw_text.startswith("Error"):
164
+ # yield raw_text, "", "", "Error occurred"
165
+ # return
166
+
167
+ # if len(raw_text.strip()) < 100:
168
+ # yield "PDF appears to be empty or contains no extractable text.", "", "", "Error occurred"
169
+ # return
170
+
171
+ # # Clean text
172
+ # yield "🧹 Cleaning text...", "", "", "Processing..."
173
+ # cleaned_text = clean_text(raw_text)
174
+
175
+ # # Chunk text
176
+ # yield "βœ‚οΈ Chunking text into sections...", "", "", "Processing..."
177
+ # chunks = chunk_text(cleaned_text)
178
+
179
+ # # Limit chunks for CPU performance
180
+ # chunks = chunks[:max_chunks]
181
+
182
+ # # Generate flashcards
183
+ # all_flashcards = []
184
+ # total_chunks = len(chunks)
185
+
186
+ # for i, chunk in enumerate(chunks):
187
+ # progress = f"🎴 Generating flashcards... ({i+1}/{total_chunks} chunks processed)"
188
+ # yield progress, "", "", "Processing..."
189
+
190
+ # cards = generate_qa_pairs(chunk, questions_per_chunk)
191
+ # all_flashcards.extend(cards)
192
+
193
+ # if not all_flashcards:
194
+ # yield "Could not generate flashcards from this PDF. Try a PDF with more textual content.", "", "", "No flashcards generated"
195
+ # return
196
+
197
+ # # Format output
198
+ # yield "βœ… Finalizing...", "", "", "Almost done..."
199
+
200
+ # # Create formatted display
201
+ # display_text = format_flashcards_display(all_flashcards)
202
+
203
+ # # Create JSON download
204
+ # json_output = json.dumps(all_flashcards, indent=2, ensure_ascii=False)
205
+
206
+ # # Create Anki/CSV format
207
+ # csv_lines = ["Question,Answer"]
208
+ # for card in all_flashcards:
209
+ # q = card['question'].replace('"', '""')
210
+ # a = card['answer'].replace('"', '""')
211
+ # csv_lines.append(f'"{q}","{a}"')
212
+ # csv_output = "\n".join(csv_lines)
213
+
214
+ # # FINAL OUTPUT - this updates all components
215
+ # yield "βœ… Done! Generated {} flashcards".format(len(all_flashcards)), csv_output, json_output, display_text
216
+
217
+ # except Exception as e:
218
+ # error_msg = f"Error processing PDF: {str(e)}"
219
+ # print(error_msg)
220
+ # yield error_msg, "", "", error_msg
221
+
222
+ # def format_flashcards_display(flashcards: List[Dict]) -> str:
223
+ # """Format flashcards for nice display."""
224
+ # lines = [f"## 🎴 Generated {len(flashcards)} Flashcards\n"]
225
+
226
+ # for i, card in enumerate(flashcards, 1):
227
+ # lines.append(f"### Card {i}")
228
+ # lines.append(f"**Q:** {card['question']}")
229
+ # lines.append(f"**A:** {card['answer']}")
230
+ # lines.append(f"*Context: {card['context'][:100]}...*\n")
231
+ # lines.append("---\n")
232
+
233
+ # return "\n".join(lines)
234
+
235
+ # def create_sample_flashcard():
236
+ # """Create a sample flashcard for demo purposes."""
237
+ # sample = [{
238
+ # "question": "What is the capital of France?",
239
+ # "answer": "Paris is the capital and most populous city of France.",
240
+ # "context": "Paris is the capital and most populous city of France..."
241
+ # }]
242
+ # return format_flashcards_display(sample)
243
+
244
+ # # Custom CSS for better styling
245
+ # custom_css = """
246
+ # .flashcard-container {
247
+ # border: 2px solid #e0e0e0;
248
+ # border-radius: 10px;
249
+ # padding: 20px;
250
+ # margin: 10px 0;
251
+ # background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
252
+ # color: white;
253
+ # }
254
+ # .question {
255
+ # font-size: 1.2em;
256
+ # font-weight: bold;
257
+ # margin-bottom: 10px;
258
+ # }
259
+ # .answer {
260
+ # font-size: 1em;
261
+ # opacity: 0.9;
262
+ # }
263
+ # """
264
+
265
+ # # Gradio Interface
266
+ # with gr.Blocks(css=custom_css, title="PDF to Flashcards") as demo:
267
+ # gr.Markdown("""
268
+ # # πŸ“š PDF to Flashcards Generator
269
+
270
+ # Upload any PDF document and automatically generate study flashcards (Q&A pairs) using AI.
271
+
272
+ # **Features:**
273
+ # - 🧠 Uses local CPU-friendly AI (no GPU needed)
274
+ # - πŸ“„ Extracts text from any PDF
275
+ # - βœ‚οΈ Intelligently chunks content
276
+ # - 🎴 Generates question-answer pairs
277
+ # - πŸ’Ύ Export to CSV (Anki-compatible) or JSON
278
+
279
+ # *Note: Processing is done entirely on CPU, so large PDFs may take a few minutes.*
280
+ # """)
281
+
282
+ # with gr.Row():
283
+ # with gr.Column(scale=1):
284
+ # pdf_input = gr.File(
285
+ # label="Upload PDF",
286
+ # file_types=[".pdf"],
287
+ # type="filepath"
288
+ # )
289
+
290
+ # with gr.Row():
291
+ # questions_per_chunk = gr.Slider(
292
+ # minimum=1,
293
+ # maximum=5,
294
+ # value=2,
295
+ # step=1,
296
+ # label="Questions per section"
297
+ # )
298
+ # max_chunks = gr.Slider(
299
+ # minimum=5,
300
+ # maximum=50,
301
+ # value=20,
302
+ # step=5,
303
+ # label="Max sections to process"
304
+ # )
305
+
306
+ # process_btn = gr.Button("πŸš€ Generate Flashcards", variant="primary")
307
+
308
+ # gr.Markdown("""
309
+ # ### πŸ’‘ Tips:
310
+ # - Text-based PDFs work best (scanned images won't work)
311
+ # - Academic papers and articles work great
312
+ # - Adjust "Questions per section" based on content density
313
+ # """)
314
+
315
+ # with gr.Column(scale=2):
316
+ # status_text = gr.Textbox(
317
+ # label="Status",
318
+ # value="Ready to process PDF...",
319
+ # interactive=False
320
+ # )
321
+
322
+ # output_display = gr.Markdown(
323
+ # label="Generated Flashcards",
324
+ # value="Your flashcards will appear here..."
325
+ # )
326
+
327
+ # with gr.Row():
328
+ # with gr.Column():
329
+ # csv_output = gr.Textbox(
330
+ # label="CSV Format (for Anki import)",
331
+ # lines=10,
332
+ # visible=True
333
+ # )
334
+ # gr.Markdown("*Copy the CSV content and save as `.csv` file to import into Anki*")
335
+
336
+ # with gr.Column():
337
+ # json_output = gr.Textbox(
338
+ # label="JSON Format",
339
+ # lines=10,
340
+ # visible=True
341
+ # )
342
+ # gr.Markdown("*Raw JSON data for custom applications*")
343
+
344
+ # # FIXED: Direct binding without the broken .then() chain
345
+ # process_btn.click(
346
+ # fn=process_pdf,
347
+ # inputs=[pdf_input, questions_per_chunk, max_chunks],
348
+ # outputs=[status_text, csv_output, json_output, output_display]
349
+ # )
350
+
351
+ # # Example section
352
+ # gr.Markdown("---")
353
+ # gr.Markdown("### 🎯 Example Output Format")
354
+ # gr.Markdown(create_sample_flashcard())
355
+
356
+ # if __name__ == "__main__":
357
+ # demo.launch()
358
+
359
+
360
+
361
+
362
+
363
+
364
+
365
+
366
+
367
+
368
+
369
+
370
+
371
+
372
  import gradio as gr
373
  import PyPDF2
374
  import re
 
391
  device = torch.device("cpu")
392
  model.to(device)
393
 
394
def extract_key_phrases(text: str) -> List[str]:
    """Extract potential answer candidates from text.

    Candidates come from two regex-based sources: runs of capitalized
    words (likely named entities) and phrases that follow concept markers
    such as "the process of ...", "known as ..." or "is ... that".

    Args:
        text: The passage to scan.

    Returns:
        Up to five unique candidate phrases in order of discovery, each
        longer than five characters after stripping whitespace.
    """

    def _usable(matches, limit):
        # Filter and de-duplicate BEFORE applying the cap; otherwise short
        # sentence-initial words ("The", "It") consume the quota and are
        # then thrown away, leaving no candidates at all.
        stripped = (m.strip() for m in matches)
        unique = dict.fromkeys(s for s in stripped if len(s) > 5)
        return list(unique)[:limit]

    candidates = []

    # Capitalized words/phrases (potential named entities).
    capitalized = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
    candidates.extend(_usable(capitalized, 3))

    # Phrases introduced by concept markers ("the process of X", ...).
    concept_patterns = [
        r'(?:process|method|technique|approach|concept|theory|principle|system) of ([^,.]{10,50})',
        r'(?:known as|called|termed|referred to as) ([^,.]{5,40})',
        r'(?:is|are|was|were) (\w+(?:\s+\w+){1,4}) (?:that|which|who)',
    ]
    for pattern in concept_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        candidates.extend(_usable(matches, 2))

    # Remove duplicates across sources while keeping discovery order.
    return list(dict.fromkeys(candidates))[:5]
418
+
419
+ def generate_questions(context: str, answer: str, question_type: str = "what", max_length: int = 128) -> str:
420
+ """Generate a question using T5 model with specified type."""
421
  try:
422
  # Format: "generate question: <hl> answer <hl> context"
423
  input_text = f"generate question: <hl> {answer} <hl> {context}"
 
431
  padding=True
432
  ).to(device)
433
 
434
+ # Generate with different parameters based on question type
435
+ temperature = 0.7 if question_type == "what" else 0.85
436
+ num_beams = 4 if question_type == "what" else 5
437
+
438
  # Generate
439
  with torch.no_grad():
440
  outputs = model.generate(
441
  **inputs,
442
  max_length=max_length,
443
+ num_beams=num_beams,
444
  early_stopping=True,
445
  do_sample=True,
446
+ temperature=temperature
447
  )
448
 
449
  # Decode
 
452
  # Clean up
453
  question = re.sub(r'^(question:|q:)', '', question, flags=re.IGNORECASE).strip()
454
 
455
+ # Post-process to improve question quality
456
+ question = improve_question(question, answer, context, question_type)
457
+
458
  return question if len(question) > 10 else ""
459
 
460
  except Exception as e:
461
  print(f"Error generating question: {e}")
462
  return ""
463
 
464
def improve_question(question: str, answer: str, context: str, question_type: str) -> str:
    """Post-process a generated question to improve quality and add variety.

    Args:
        question: Raw question text produced by the model.
        answer: The answer the question was generated for.
        context: The passage the answer was taken from.
        question_type: Requested style: "what", "why" or "how".

    Returns:
        The cleaned question, ending in "?" with its first letter
        capitalized. For "why"/"how" requests it may be replaced by a
        templated question. Returns "" when the input question is empty
        or whitespace-only.
    """
    question = question.strip()
    if not question:
        # Nothing to improve; do not fabricate a bare "?".
        return ""

    # Ensure the question ends with a question mark.
    if not question.endswith('?'):
        question = question.rstrip('. ') + '?'

    # Capitalize the first letter, leaving the rest untouched.
    question = question[0].upper() + question[1:]

    lowered = question.lower()
    if question_type == "why" and not lowered.startswith("why"):
        # Only attempt the rewrite when the question has an auxiliary verb.
        if re.search(r'\b(is|are|was|were|does|do|did)\b', question, re.IGNORECASE):
            question = create_why_question(question, answer, context)
    elif question_type == "how" and not lowered.startswith("how"):
        if re.search(r'\b(does|do|did|can|could)\b', question, re.IGNORECASE):
            question = create_how_question(question, answer, context)

    return question
486
+
487
def create_why_question(base_question: str, answer: str, context: str) -> str:
    """Transform or create a 'why' question.

    Args:
        base_question: The question to fall back to when no rewrite applies.
        answer: The answer text; its first four words seed the generic
            fallback question.
        context: The passage searched for causal wording and a subject.

    Returns:
        "Why does <subject> occur?" when the context contains a causal
        phrase and a subject followed by an auxiliary verb; otherwise
        "Why is <first four answer words>... important?" when the answer
        has more than three words; otherwise ``base_question`` unchanged.
    """
    # Causal indicators to look for in the context.
    causal_patterns = [
        r'because ([^,.]{10,60})',
        r'due to ([^,.]{10,60})',
        r'as a result of ([^,.]{10,60})',
        r'(?:leads to|causes|results in) ([^,.]{10,60})',
        r'in order to ([^,.]{10,60})'
    ]

    has_causal_phrase = any(
        re.search(pattern, context, re.IGNORECASE) for pattern in causal_patterns
    )
    if has_causal_phrase:
        # The subject search does not depend on which causal pattern hit,
        # so it is run once rather than once per matching pattern.
        subject_match = re.search(
            r'([A-Z][a-z]+(?:\s+[a-z]+){0,3})\s+(?:is|are|was|were|does|do)',
            context,
        )
        if subject_match:
            subject = subject_match.group(1)
            return f"Why does {subject.lower()} occur?"

    # Fallback: build a generic question from the start of the answer.
    words = answer.split()
    if len(words) > 3:
        return f"Why is {' '.join(words[:4])}... important?"

    return base_question
515
+
516
def create_how_question(base_question: str, answer: str, context: str) -> str:
    """Transform or create a 'how' question.

    Args:
        base_question: The question to fall back to when no rewrite applies.
        answer: The answer text (accepted for signature parity with
            ``create_why_question``; not used here).
        context: The passage searched for process wording.

    Returns:
        A templated "How ...?" question built from the first process
        phrase found in the context, or from a subject followed by an
        operational verb; otherwise ``base_question`` unchanged.
    """
    # (pattern, capture group holding the phrase, question template).
    # The leading \b stops "by"/"through" from matching inside longer words
    # such as "nearby" or "breakthrough".
    process_patterns = [
        (r'(process|method|procedure|technique|approach) (?:of|for|to) ([^,.]{10,60})',
         2, "How does {} work?"),
        (r'\bby ([^,.]{10,60})', 1, "How is {} achieved?"),
        (r'\bthrough ([^,.]{10,60})', 1, "How is {} achieved?"),
    ]

    for pattern, group_index, template in process_patterns:
        match = re.search(pattern, context, re.IGNORECASE)
        if match:
            return template.format(match.group(group_index).lower())

    # Fallback: look for "<subject> works/functions/..." in the context.
    verbs = re.findall(r'\b(works?|functions?|operates?|performs?|executes?)\b', context, re.IGNORECASE)
    if verbs:
        verb = verbs[0]
        # Escape the verb before splicing it into a pattern.
        subject_match = re.search(
            r'([A-Z][a-z]+(?:\s+[a-z]+){0,3})\s+' + re.escape(verb),
            context,
            re.IGNORECASE,
        )
        if subject_match:
            subject = subject_match.group(1)
            return f"How does {subject.lower()} {verb.lower()}?"

    return base_question
545
+
546
  def extract_text_from_pdf(pdf_file) -> str:
547
  """Extract text from uploaded PDF file."""
548
  text = ""
 
597
 
598
  return overlapped_chunks
599
 
600
+ def generate_qa_pairs(chunk: str, num_questions: int = 3) -> List[Dict[str, str]]:
601
+ """Generate question-answer pairs from a text chunk with variety."""
602
  flashcards = []
603
 
604
  # Skip chunks that are too short
 
607
  return []
608
 
609
  try:
610
+ # Extract key phrases for answers
611
+ key_phrases = extract_key_phrases(chunk)
612
+
613
+ # Also use sentences as potential answers
614
  sentences = [s.strip() for s in chunk.split('. ') if len(s.strip()) > 20]
615
 
616
+ # Combine both sources
617
+ answer_candidates = key_phrases + sentences[:2]
618
+
619
+ if len(answer_candidates) < 1:
620
  return []
621
 
622
+ # Define question types to generate
623
+ question_types = ["what", "why", "how"]
624
+
625
+ # Generate diverse questions
626
+ questions_generated = 0
627
+ for i, answer in enumerate(answer_candidates):
628
+ if questions_generated >= num_questions:
629
+ break
630
 
631
  # Skip very short answers
632
  if len(answer.split()) < 3:
633
  continue
634
 
635
+ # Cycle through question types
636
+ q_type = question_types[i % len(question_types)]
637
+
638
+ question = generate_questions(chunk, answer, question_type=q_type)
639
 
640
  if question and question != answer: # Make sure they're different
641
  flashcards.append({
642
  "question": question,
643
  "answer": answer,
644
+ "context": chunk[:200] + "..." if len(chunk) > 200 else chunk,
645
+ "type": q_type
646
  })
647
+ questions_generated += 1
648
 
649
  except Exception as e:
650
  print(f"Error generating QA: {e}")
651
 
652
  return flashcards
653
 
654
+ def process_pdf(pdf_file, questions_per_chunk: int = 3, max_chunks: int = 20):
655
  """Main processing function."""
656
  if pdf_file is None:
657
  return "Please upload a PDF file.", "", "", "Your flashcards will appear here..."
 
705
  json_output = json.dumps(all_flashcards, indent=2, ensure_ascii=False)
706
 
707
  # Create Anki/CSV format
708
+ csv_lines = ["Question,Answer,Type"]
709
  for card in all_flashcards:
710
  q = card['question'].replace('"', '""')
711
  a = card['answer'].replace('"', '""')
712
+ t = card.get('type', 'what')
713
+ csv_lines.append(f'"{q}","{a}","{t}"')
714
  csv_output = "\n".join(csv_lines)
715
 
716
  # FINAL OUTPUT - this updates all components
717
+ stats = f"βœ… Done! Generated {len(all_flashcards)} flashcards ("
718
+ types_count = {}
719
+ for card in all_flashcards:
720
+ t = card.get('type', 'what')
721
+ types_count[t] = types_count.get(t, 0) + 1
722
+ stats += ", ".join([f"{count} {qtype}" for qtype, count in types_count.items()]) + ")"
723
+
724
+ yield stats, csv_output, json_output, display_text
725
 
726
  except Exception as e:
727
  error_msg = f"Error processing PDF: {str(e)}"
 
732
  """Format flashcards for nice display."""
733
  lines = [f"## 🎴 Generated {len(flashcards)} Flashcards\n"]
734
 
735
+ # Count by type
736
+ types_count = {}
737
+ for card in flashcards:
738
+ t = card.get('type', 'what')
739
+ types_count[t] = types_count.get(t, 0) + 1
740
+
741
+ lines.append(f"**Breakdown:** {', '.join([f'{count} {qtype.upper()}' for qtype, count in types_count.items()])}\n")
742
+ lines.append("---\n")
743
+
744
  for i, card in enumerate(flashcards, 1):
745
+ qtype = card.get('type', 'what').upper()
746
+ emoji = "❓" if qtype == "WHAT" else "πŸ€”" if qtype == "WHY" else "πŸ”§"
747
+
748
+ lines.append(f"### {emoji} Card {i} - {qtype}")
749
  lines.append(f"**Q:** {card['question']}")
750
  lines.append(f"**A:** {card['answer']}")
751
  lines.append(f"*Context: {card['context'][:100]}...*\n")
 
755
 
756
  def create_sample_flashcard():
757
  """Create a sample flashcard for demo purposes."""
758
+ sample = [
759
+ {
760
+ "question": "What is photosynthesis?",
761
+ "answer": "Photosynthesis is the process by which plants convert sunlight into energy.",
762
+ "context": "Photosynthesis is the process by which plants convert sunlight into energy...",
763
+ "type": "what"
764
+ },
765
+ {
766
+ "question": "Why do plants need chlorophyll?",
767
+ "answer": "Chlorophyll absorbs light energy needed for photosynthesis.",
768
+ "context": "Chlorophyll absorbs light energy needed for photosynthesis...",
769
+ "type": "why"
770
+ },
771
+ {
772
+ "question": "How do plants convert light into chemical energy?",
773
+ "answer": "Through the process of photosynthesis in the chloroplasts.",
774
+ "context": "Through the process of photosynthesis in the chloroplasts...",
775
+ "type": "how"
776
+ }
777
+ ]
778
  return format_flashcards_display(sample)
779
 
780
  # Custom CSS for better styling
 
801
  # Gradio Interface
802
  with gr.Blocks(css=custom_css, title="PDF to Flashcards") as demo:
803
  gr.Markdown("""
804
+ # πŸ“š PDF to Flashcards Generator (Enhanced)
805
+
806
+ Upload any PDF document and automatically generate study flashcards with **What, Why, and How** questions using AI.
807
 
808
+ **✨ New Features:**
809
+ - 🎯 Generates **What** questions (factual)
810
+ - πŸ€” Generates **Why** questions (reasoning)
811
+ - πŸ”§ Generates **How** questions (process)
812
+ - πŸ“Š Improved question quality and variety
813
+ - 🧠 Better answer extraction
814
 
815
+ **Core Features:**
816
  - 🧠 Uses local CPU-friendly AI (no GPU needed)
817
  - πŸ“„ Extracts text from any PDF
818
  - βœ‚οΈ Intelligently chunks content
819
+ - 🎴 Generates diverse question-answer pairs
820
  - πŸ’Ύ Export to CSV (Anki-compatible) or JSON
821
 
822
  *Note: Processing is done entirely on CPU, so large PDFs may take a few minutes.*
 
833
  with gr.Row():
834
  questions_per_chunk = gr.Slider(
835
  minimum=1,
836
+ maximum=6,
837
+ value=3,
838
  step=1,
839
  label="Questions per section"
840
  )
 
852
  ### πŸ’‘ Tips:
853
  - Text-based PDFs work best (scanned images won't work)
854
  - Academic papers and articles work great
855
+ - Adjust "Questions per section" for more variety
856
+ - Higher questions per section = more Why/How questions
857
  """)
858
 
859
  with gr.Column(scale=2):
 
885
  )
886
  gr.Markdown("*Raw JSON data for custom applications*")
887
 
888
+ # Direct binding
889
  process_btn.click(
890
  fn=process_pdf,
891
  inputs=[pdf_input, questions_per_chunk, max_chunks],