File size: 31,297 Bytes
d818498
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72e0c96
c1ea31c
95abb5a
c1ea31c
7bf9c65
 
 
c1ea31c
 
95abb5a
7bf9c65
c1ea31c
7bf9c65
 
 
 
 
 
 
 
 
 
d818498
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7bf9c65
 
 
 
 
 
 
 
 
 
 
 
 
d818498
 
 
 
7bf9c65
 
 
 
 
d818498
7bf9c65
 
d818498
7bf9c65
 
 
 
 
 
 
 
d818498
 
 
7bf9c65
 
 
 
 
95abb5a
d818498
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c1ea31c
 
 
 
 
 
 
 
 
 
7bf9c65
 
 
c1ea31c
 
 
 
95abb5a
c1ea31c
 
 
 
 
 
 
95abb5a
c1ea31c
 
 
 
 
95abb5a
c1ea31c
 
 
 
 
 
 
95abb5a
c1ea31c
 
95abb5a
c1ea31c
 
 
 
 
 
 
 
95abb5a
c1ea31c
 
d818498
 
c1ea31c
95abb5a
c1ea31c
7bf9c65
 
c1ea31c
95abb5a
c1ea31c
d818498
 
 
 
7bf9c65
 
d818498
 
 
 
c1ea31c
 
d818498
 
 
 
 
 
 
 
c1ea31c
7bf9c65
 
 
c1ea31c
d818498
 
 
 
c1ea31c
7bf9c65
c1ea31c
 
7bf9c65
d818498
 
c1ea31c
d818498
c1ea31c
 
 
 
 
 
d818498
c1ea31c
 
3eeedea
c1ea31c
 
 
3eeedea
c1ea31c
95abb5a
c1ea31c
3eeedea
 
95abb5a
c1ea31c
3eeedea
 
95abb5a
c1ea31c
3eeedea
c1ea31c
95abb5a
c1ea31c
3eeedea
c1ea31c
95abb5a
c1ea31c
 
95abb5a
c1ea31c
 
 
95abb5a
c1ea31c
 
3eeedea
c1ea31c
 
 
95abb5a
c1ea31c
3eeedea
 
95abb5a
c1ea31c
3eeedea
95abb5a
c1ea31c
 
95abb5a
c1ea31c
 
95abb5a
c1ea31c
d818498
c1ea31c
 
 
d818498
 
c1ea31c
95abb5a
3eeedea
d818498
 
 
 
 
 
 
 
95abb5a
c1ea31c
3eeedea
 
 
95abb5a
c1ea31c
 
 
 
d818498
 
 
 
 
 
 
 
 
c1ea31c
d818498
 
 
 
c1ea31c
 
 
 
 
 
95abb5a
c1ea31c
 
d818498
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c1ea31c
95abb5a
c1ea31c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95abb5a
c1ea31c
 
 
d818498
 
 
c1ea31c
d818498
 
 
 
 
 
c1ea31c
d818498
c1ea31c
 
 
d818498
c1ea31c
 
 
 
 
 
 
 
 
 
 
72e0c96
c1ea31c
 
 
 
d818498
 
c1ea31c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d818498
 
c1ea31c
95abb5a
c1ea31c
 
 
 
 
72e0c96
95abb5a
c1ea31c
 
 
72e0c96
95abb5a
72e0c96
c1ea31c
 
 
 
 
95abb5a
c1ea31c
 
 
 
 
 
 
 
 
 
d818498
c1ea31c
 
 
3eeedea
72e0c96
c1ea31c
 
 
 
 
72e0c96
 
95abb5a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
# import gradio as gr
# import PyPDF2
# import re
# import json
# from typing import List, Dict
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# import torch
# import tempfile
# import os

# # Initialize the model and tokenizer directly
# print("Loading models... This may take a minute on first run.")

# model_name = "valhalla/t5-small-qg-hl"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# # Set to evaluation mode and CPU
# model.eval()
# device = torch.device("cpu")
# model.to(device)

# def generate_questions(context: str, answer: str, max_length: int = 128) -> str:
#     """Generate a question using T5 model."""
#     try:
#         # Format: "generate question: <hl> answer <hl> context"
#         input_text = f"generate question: <hl> {answer} <hl> {context}"
        
#         # Tokenize
#         inputs = tokenizer(
#             input_text,
#             return_tensors="pt",
#             max_length=512,
#             truncation=True,
#             padding=True
#         ).to(device)
        
#         # Generate
#         with torch.no_grad():
#             outputs = model.generate(
#                 **inputs,
#                 max_length=max_length,
#                 num_beams=4,
#                 early_stopping=True,
#                 do_sample=True,
#                 temperature=0.7
#             )
        
#         # Decode
#         question = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
#         # Clean up
#         question = re.sub(r'^(question:|q:)', '', question, flags=re.IGNORECASE).strip()
        
#         return question if len(question) > 10 else ""
        
#     except Exception as e:
#         print(f"Error generating question: {e}")
#         return ""

# def extract_text_from_pdf(pdf_file) -> str:
#     """Extract text from uploaded PDF file."""
#     text = ""
#     try:
#         if isinstance(pdf_file, str):
#             pdf_reader = PyPDF2.PdfReader(pdf_file)
#         else:
#             pdf_reader = PyPDF2.PdfReader(pdf_file)
        
#         for page in pdf_reader.pages:
#             page_text = page.extract_text()
#             if page_text:
#                 text += page_text + "\n"
#     except Exception as e:
#         return f"Error reading PDF: {str(e)}"
    
#     return text

# def clean_text(text: str) -> str:
#     """Clean and preprocess extracted text."""
#     # Remove excessive whitespace
#     text = re.sub(r'\s+', ' ', text)
#     # Remove special characters but keep sentence structure
#     text = re.sub(r'[^\w\s.,;!?-]', '', text)
#     return text.strip()

# def chunk_text(text: str, max_chunk_size: int = 512, overlap: int = 50) -> List[str]:
#     """Split text into overlapping chunks for processing."""
#     sentences = re.split(r'(?<=[.!?])\s+', text)
#     chunks = []
#     current_chunk = ""
    
#     for sentence in sentences:
#         if len(current_chunk) + len(sentence) < max_chunk_size:
#             current_chunk += " " + sentence
#         else:
#             if current_chunk:
#                 chunks.append(current_chunk.strip())
#             current_chunk = sentence
    
#     if current_chunk:
#         chunks.append(current_chunk.strip())
    
#     # Add overlap between chunks for context
#     overlapped_chunks = []
#     for i, chunk in enumerate(chunks):
#         if i > 0 and overlap > 0:
#             prev_sentences = chunks[i-1].split('. ')
#             overlap_text = '. '.join(prev_sentences[-2:]) if len(prev_sentences) > 1 else chunks[i-1][-overlap:]
#             chunk = overlap_text + " " + chunk
#         overlapped_chunks.append(chunk)
    
#     return overlapped_chunks

# def generate_qa_pairs(chunk: str, num_questions: int = 2) -> List[Dict[str, str]]:
#     """Generate question-answer pairs from a text chunk."""
#     flashcards = []
    
#     # Skip chunks that are too short
#     words = chunk.split()
#     if len(words) < 20:
#         return []
    
#     try:
#         # Split into sentences to use as answers
#         sentences = [s.strip() for s in chunk.split('. ') if len(s.strip()) > 20]
        
#         if len(sentences) < 1:
#             return []
        
#         # Generate questions for different sentences
#         for i in range(min(num_questions, len(sentences))):
#             answer = sentences[i]
            
#             # Skip very short answers
#             if len(answer.split()) < 3:
#                 continue
            
#             question = generate_questions(chunk, answer)
            
#             if question and question != answer:  # Make sure they're different
#                 flashcards.append({
#                     "question": question,
#                     "answer": answer,
#                     "context": chunk[:200] + "..." if len(chunk) > 200 else chunk
#                 })
                
#     except Exception as e:
#         print(f"Error generating QA: {e}")
    
#     return flashcards

# def process_pdf(pdf_file, questions_per_chunk: int = 2, max_chunks: int = 20):
#     """Main processing function."""
#     if pdf_file is None:
#         return "Please upload a PDF file.", "", "", "Your flashcards will appear here..."
    
#     try:
#         # Extract text
#         yield "πŸ“„ Extracting text from PDF...", "", "", "Processing..."
#         raw_text = extract_text_from_pdf(pdf_file)
        
#         if raw_text.startswith("Error"):
#             yield raw_text, "", "", "Error occurred"
#             return
        
#         if len(raw_text.strip()) < 100:
#             yield "PDF appears to be empty or contains no extractable text.", "", "", "Error occurred"
#             return
        
#         # Clean text
#         yield "🧹 Cleaning text...", "", "", "Processing..."
#         cleaned_text = clean_text(raw_text)
        
#         # Chunk text
#         yield "βœ‚οΈ Chunking text into sections...", "", "", "Processing..."
#         chunks = chunk_text(cleaned_text)
        
#         # Limit chunks for CPU performance
#         chunks = chunks[:max_chunks]
        
#         # Generate flashcards
#         all_flashcards = []
#         total_chunks = len(chunks)
        
#         for i, chunk in enumerate(chunks):
#             progress = f"🎴 Generating flashcards... ({i+1}/{total_chunks} chunks processed)"
#             yield progress, "", "", "Processing..."
            
#             cards = generate_qa_pairs(chunk, questions_per_chunk)
#             all_flashcards.extend(cards)
        
#         if not all_flashcards:
#             yield "Could not generate flashcards from this PDF. Try a PDF with more textual content.", "", "", "No flashcards generated"
#             return
        
#         # Format output
#         yield "βœ… Finalizing...", "", "", "Almost done..."
        
#         # Create formatted display
#         display_text = format_flashcards_display(all_flashcards)
        
#         # Create JSON download
#         json_output = json.dumps(all_flashcards, indent=2, ensure_ascii=False)
        
#         # Create Anki/CSV format
#         csv_lines = ["Question,Answer"]
#         for card in all_flashcards:
#             q = card['question'].replace('"', '""')
#             a = card['answer'].replace('"', '""')
#             csv_lines.append(f'"{q}","{a}"')
#         csv_output = "\n".join(csv_lines)
        
#         # FINAL OUTPUT - this updates all components
#         yield "βœ… Done! Generated {} flashcards".format(len(all_flashcards)), csv_output, json_output, display_text
        
#     except Exception as e:
#         error_msg = f"Error processing PDF: {str(e)}"
#         print(error_msg)
#         yield error_msg, "", "", error_msg

# def format_flashcards_display(flashcards: List[Dict]) -> str:
#     """Format flashcards for nice display."""
#     lines = [f"## 🎴 Generated {len(flashcards)} Flashcards\n"]
    
#     for i, card in enumerate(flashcards, 1):
#         lines.append(f"### Card {i}")
#         lines.append(f"**Q:** {card['question']}")
#         lines.append(f"**A:** {card['answer']}")
#         lines.append(f"*Context: {card['context'][:100]}...*\n")
#         lines.append("---\n")
    
#     return "\n".join(lines)

# def create_sample_flashcard():
#     """Create a sample flashcard for demo purposes."""
#     sample = [{
#         "question": "What is the capital of France?",
#         "answer": "Paris is the capital and most populous city of France.",
#         "context": "Paris is the capital and most populous city of France..."
#     }]
#     return format_flashcards_display(sample)

# # Custom CSS for better styling
# custom_css = """
# .flashcard-container {
#     border: 2px solid #e0e0e0;
#     border-radius: 10px;
#     padding: 20px;
#     margin: 10px 0;
#     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
#     color: white;
# }
# .question {
#     font-size: 1.2em;
#     font-weight: bold;
#     margin-bottom: 10px;
# }
# .answer {
#     font-size: 1em;
#     opacity: 0.9;
# }
# """

# # Gradio Interface
# with gr.Blocks(css=custom_css, title="PDF to Flashcards") as demo:
#     gr.Markdown("""
#     # πŸ“š PDF to Flashcards Generator
    
#     Upload any PDF document and automatically generate study flashcards (Q&A pairs) using AI.
    
#     **Features:**
#     - 🧠 Uses local CPU-friendly AI (no GPU needed)
#     - πŸ“„ Extracts text from any PDF
#     - βœ‚οΈ Intelligently chunks content
#     - 🎴 Generates question-answer pairs
#     - πŸ’Ύ Export to CSV (Anki-compatible) or JSON
    
#     *Note: Processing is done entirely on CPU, so large PDFs may take a few minutes.*
#     """)
    
#     with gr.Row():
#         with gr.Column(scale=1):
#             pdf_input = gr.File(
#                 label="Upload PDF",
#                 file_types=[".pdf"],
#                 type="filepath"
#             )
            
#             with gr.Row():
#                 questions_per_chunk = gr.Slider(
#                     minimum=1,
#                     maximum=5,
#                     value=2,
#                     step=1,
#                     label="Questions per section"
#                 )
#                 max_chunks = gr.Slider(
#                     minimum=5,
#                     maximum=50,
#                     value=20,
#                     step=5,
#                     label="Max sections to process"
#                 )
            
#             process_btn = gr.Button("πŸš€ Generate Flashcards", variant="primary")
            
#             gr.Markdown("""
#             ### πŸ’‘ Tips:
#             - Text-based PDFs work best (scanned images won't work)
#             - Academic papers and articles work great
#             - Adjust "Questions per section" based on content density
#             """)
        
#         with gr.Column(scale=2):
#             status_text = gr.Textbox(
#                 label="Status",
#                 value="Ready to process PDF...",
#                 interactive=False
#             )
            
#             output_display = gr.Markdown(
#                 label="Generated Flashcards",
#                 value="Your flashcards will appear here..."
#             )
    
#     with gr.Row():
#         with gr.Column():
#             csv_output = gr.Textbox(
#                 label="CSV Format (for Anki import)",
#                 lines=10,
#                 visible=True
#             )
#             gr.Markdown("*Copy the CSV content and save as `.csv` file to import into Anki*")
        
#         with gr.Column():
#             json_output = gr.Textbox(
#                 label="JSON Format",
#                 lines=10,
#                 visible=True
#             )
#             gr.Markdown("*Raw JSON data for custom applications*")
    
#     # FIXED: Direct binding without the broken .then() chain
#     process_btn.click(
#         fn=process_pdf,
#         inputs=[pdf_input, questions_per_chunk, max_chunks],
#         outputs=[status_text, csv_output, json_output, output_display]
#     )
    
#     # Example section
#     gr.Markdown("---")
#     gr.Markdown("### 🎯 Example Output Format")
#     gr.Markdown(create_sample_flashcard())

# if __name__ == "__main__":
#     demo.launch()














import gradio as gr
import PyPDF2
import re
import json
from typing import List, Dict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import tempfile
import os

# Module-level setup: everything below runs once, at import time, and the
# resulting `tokenizer`, `model` and `device` globals are read by
# generate_questions().
print("Loading models... This may take a minute on first run.")

# Checkpoint identifier handed to both from_pretrained() calls so the
# tokenizer and the seq2seq model always come from the same source.
model_name = "valhalla/t5-small-qg-hl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Inference only: switch the model to eval mode and pin it to the CPU device.
model.eval()
device = torch.device("cpu")
model.to(device)

def extract_key_phrases(text: str) -> List[str]:
    """Pick out short phrases from ``text`` that can serve as flashcard answers.

    Candidates come from two regex passes: runs of capitalized words (at most
    the first three hits) and the tail of a few cue phrases such as
    "process of ..." or "known as ..." (at most two hits per cue).

    Args:
        text: The passage to scan.

    Returns:
        At most five distinct phrases, each longer than five characters after
        stripping, in the order they were found.
    """
    # Runs of capitalized words; only the first three hits are kept.
    capitalized_runs = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
    found = list(capitalized_runs[:3])

    # Each cue captures the phrase that follows (or sits inside) the cue words.
    cue_patterns = (
        r'(?:process|method|technique|approach|concept|theory|principle|system) of ([^,.]{10,50})',
        r'(?:known as|called|termed|referred to as) ([^,.]{5,40})',
        r'(?:is|are|was|were) (\w+(?:\s+\w+){1,4}) (?:that|which|who)',
    )
    for cue in cue_patterns:
        found += re.findall(cue, text, re.IGNORECASE)[:2]

    # Strip, drop anything of five characters or fewer, and keep only the
    # first occurrence of each phrase.
    unique: List[str] = []
    for raw in found:
        phrase = raw.strip()
        if len(phrase) > 5 and phrase not in unique:
            unique.append(phrase)

    return unique[:5]

def generate_questions(context: str, answer: str, question_type: str = "what", max_length: int = 128) -> str:
    """Ask the module-level seq2seq model for a question whose answer is ``answer``.

    Args:
        context: Passage the answer was taken from.
        answer: Text wrapped in ``<hl>`` markers inside the prompt.
        question_type: ``"what"`` uses temperature 0.7 with 4 beams; any other
            value uses temperature 0.85 with 5 beams. The value is also passed
            on to ``improve_question``.
        max_length: ``max_length`` forwarded to ``model.generate``.

    Returns:
        The post-processed question, or ``""`` when the result is ten
        characters or shorter or when any exception is raised (the exception
        is printed, not propagated).
    """
    try:
        # Prompt layout: "generate question: <hl> answer <hl> context"
        prompt = f"generate question: <hl> {answer} <hl> {context}"

        encoded = tokenizer(
            prompt,
            return_tensors="pt",
            max_length=512,
            truncation=True,
            padding=True
        ).to(device)

        # Decoding settings depend on the requested question type.
        if question_type == "what":
            temperature, num_beams = 0.7, 4
        else:
            temperature, num_beams = 0.85, 5

        with torch.no_grad():
            generated = model.generate(
                **encoded,
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=True,
                do_sample=True,
                temperature=temperature
            )

        decoded = tokenizer.decode(generated[0], skip_special_tokens=True)

        # Drop a leading "question:" / "q:" label if the model emitted one.
        unlabeled = re.sub(r'^(question:|q:)', '', decoded, flags=re.IGNORECASE).strip()

        polished = improve_question(unlabeled, answer, context, question_type)

        if len(polished) > 10:
            return polished
        return ""

    except Exception as e:
        print(f"Error generating question: {e}")
        return ""

def improve_question(question: str, answer: str, context: str, question_type: str) -> str:
    """Normalize a generated question and optionally recast it as why/how.

    Args:
        question: Raw question text.
        answer: Answer the question targets; forwarded to the rewrite helpers.
        context: Source passage; forwarded to the rewrite helpers.
        question_type: ``"why"`` or ``"how"`` may trigger a rewrite; any other
            value leaves the normalized question as is.

    Returns:
        The question ending in ``?`` with its first character upper-cased, or
        the output of ``create_why_question`` / ``create_how_question`` when a
        rewrite applies.
    """
    # Guarantee a trailing question mark (trailing periods are removed first).
    if not question.endswith('?'):
        question = question.rstrip('.') + '?'

    # Upper-case the first character.
    if question:
        question = question[0].upper() + question[1:]

    opening = question.lower()

    if question_type == "why":
        # Rewrite only when it is not already a "why" question and it
        # contains one of the listed auxiliary verbs.
        if not opening.startswith("why") and re.search(
            r'\b(is|are|was|were|does|do|did)\b', question, re.IGNORECASE
        ):
            return create_why_question(question, answer, context)
    elif question_type == "how":
        if not opening.startswith("how") and re.search(
            r'\b(does|do|did|can|could)\b', question, re.IGNORECASE
        ):
            return create_how_question(question, answer, context)

    return question

def create_why_question(base_question: str, answer: str, context: str) -> str:
    """Build a 'why' question from the context or answer, else keep the original.

    Args:
        base_question: Question returned when no rewrite is possible.
        answer: Answer text; its first four words feed the generic fallback.
        context: Passage searched for causal wording and a subject.

    Returns:
        ``"Why does <subject> occur?"`` when the context has a causal cue and
        a subject can be found; otherwise ``"Why is <first four answer
        words>... important?"`` when the answer has more than three words;
        otherwise ``base_question``.
    """
    causal_cues = (
        r'because ([^,.]{10,60})',
        r'due to ([^,.]{10,60})',
        r'as a result of ([^,.]{10,60})',
        r'(?:leads to|causes|results in) ([^,.]{10,60})',
        r'in order to ([^,.]{10,60})'
    )

    # The subject lookup does not depend on which cue matched, so one causal
    # hit anywhere in the context is enough to attempt it.
    if any(re.search(cue, context, re.IGNORECASE) for cue in causal_cues):
        subject_hit = re.search(
            r'([A-Z][a-z]+(?:\s+[a-z]+){0,3})\s+(?:is|are|was|were|does|do)',
            context,
        )
        if subject_hit is not None:
            return f"Why does {subject_hit.group(1).lower()} occur?"

    # Generic fallback built from the opening words of the answer.
    tokens = answer.split()
    if len(tokens) <= 3:
        return base_question
    return f"Why is {' '.join(tokens[:4])}... important?"

def create_how_question(base_question: str, answer: str, context: str) -> str:
    """Build a 'how' question from process wording in the context, else keep the original.

    Args:
        base_question: Question returned when nothing in the context matches.
        answer: Accepted for signature parity with ``create_why_question``;
            not read here.
        context: Passage searched for process wording.

    Returns:
        The first of these that applies, otherwise ``base_question``:
        ``"How does <phrase> work?"`` for "process/method/... of|for|to
        <phrase>"; ``"How is <phrase> achieved?"`` for "by <phrase>" then
        "through <phrase>"; ``"How does <subject> <verb>?"`` when the context
        contains one of the listed operation verbs preceded by a subject.
    """
    # Named process: "<process word> of/for/to <phrase>".
    named_process = re.search(
        r'(process|method|procedure|technique|approach) (?:of|for|to) ([^,.]{10,60})',
        context,
        re.IGNORECASE,
    )
    if named_process:
        return f"How does {named_process.group(2).lower()} work?"

    # Means phrases, tried in this order.
    for means_pattern in (r'by ([^,.]{10,60})', r'through ([^,.]{10,60})'):
        means_hit = re.search(means_pattern, context, re.IGNORECASE)
        if means_hit:
            return f"How is {means_hit.group(1).lower()} achieved?"

    # Generic fallback: the first operation verb plus the words preceding it.
    verb_hit = re.search(
        r'\b(works?|functions?|operates?|performs?|executes?)\b', context, re.IGNORECASE
    )
    if verb_hit:
        verb = verb_hit.group(1)
        subject_hit = re.search(
            r'([A-Z][a-z]+(?:\s+[a-z]+){0,3})\s+' + verb, context, re.IGNORECASE
        )
        if subject_hit:
            return f"How does {subject_hit.group(1).lower()} {verb.lower()}?"

    return base_question

def extract_text_from_pdf(pdf_file) -> str:
    """Return the text of every page of a PDF, one page per line group.

    Args:
        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts; it is passed through
            unchanged.

    Returns:
        The extracted text of each non-empty page followed by a newline,
        concatenated in page order. On any exception the function does not
        raise; it returns a message starting with ``"Error reading PDF: "``.
    """
    try:
        reader = PyPDF2.PdfReader(pdf_file)
        page_texts = []
        for page in reader.pages:
            extracted = page.extract_text()
            # Pages that yield no text are skipped entirely.
            if extracted:
                page_texts.append(extracted + "\n")
    except Exception as e:
        return f"Error reading PDF: {str(e)}"

    return "".join(page_texts)

def clean_text(text: str) -> str:
    """Normalize whitespace and strip unwanted symbols from extracted text.

    Args:
        text: Raw text.

    Returns:
        The text with every whitespace run collapsed to one space, every
        character other than word characters, whitespace and ``. , ; ! ? -``
        removed, and leading/trailing whitespace trimmed.
    """
    # Step 1: collapse runs of whitespace (including newlines) to one space.
    collapsed = re.sub(r'\s+', ' ', text)
    # Step 2: keep only word characters, whitespace and basic punctuation.
    filtered = re.sub(r'[^\w\s.,;!?-]', '', collapsed)
    return filtered.strip()

def chunk_text(text: str, max_chunk_size: int = 512, overlap: int = 50) -> List[str]:
    """Group sentences into chunks and prefix each chunk with context from the previous one.

    Args:
        text: Text to split; sentences are cut after ``.``, ``!`` or ``?``
            followed by whitespace.
        max_chunk_size: A sentence joins the current chunk only while the
            combined character count stays below this value.
        overlap: When greater than zero, every chunk after the first is
            prefixed with the last two ``'. '``-separated pieces of the
            previous chunk, or with that chunk's last ``overlap`` characters
            if it has only one piece.

    Returns:
        The chunks in document order.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text)

    base_chunks: List[str] = []
    buffer = ""
    for piece in pieces:
        if len(buffer) + len(piece) >= max_chunk_size:
            # Buffer is full: flush it and start a new chunk with this piece.
            if buffer:
                base_chunks.append(buffer.strip())
            buffer = piece
        else:
            buffer = f"{buffer} {piece}"

    if buffer:
        base_chunks.append(buffer.strip())

    # Without a positive overlap the chunks are returned untouched.
    if overlap <= 0:
        return list(base_chunks)

    with_context = base_chunks[:1]
    for previous, current in zip(base_chunks, base_chunks[1:]):
        tail_pieces = previous.split('. ')
        if len(tail_pieces) > 1:
            carried = '. '.join(tail_pieces[-2:])
        else:
            carried = previous[-overlap:]
        with_context.append(carried + " " + current)

    return with_context

def generate_qa_pairs(chunk: str, num_questions: int = 3) -> List[Dict[str, str]]:
    """Produce up to ``num_questions`` flashcards from one text chunk.

    Answer candidates are the key phrases from ``extract_key_phrases``
    followed by the first two ``'. '``-separated sentences longer than 20
    characters. The question type cycles what / why / how by candidate
    position.

    Args:
        chunk: Text to generate cards from; chunks under 20 words yield
            nothing.
        num_questions: Maximum number of cards to return.

    Returns:
        A list of dicts with ``question``, ``answer``, ``context`` (the chunk,
        cut to 200 characters plus ``"..."`` when longer) and ``type``. If an
        exception is raised it is printed and the cards collected so far are
        returned.
    """
    cards: List[Dict[str, str]] = []

    # Too little text to produce a meaningful card.
    if len(chunk.split()) < 20:
        return []

    # Same preview is attached to every card produced from this chunk.
    preview = chunk[:200] + "..." if len(chunk) > 200 else chunk
    type_cycle = ["what", "why", "how"]

    try:
        phrase_candidates = extract_key_phrases(chunk)
        sentence_candidates = [
            part.strip() for part in chunk.split('. ') if len(part.strip()) > 20
        ]
        candidates = phrase_candidates + sentence_candidates[:2]

        if not candidates:
            return []

        for position, candidate in enumerate(candidates):
            # Stop once enough cards have been accepted.
            if len(cards) >= num_questions:
                break

            # Answers of fewer than three words are skipped.
            if len(candidate.split()) < 3:
                continue

            kind = type_cycle[position % len(type_cycle)]
            generated = generate_questions(chunk, candidate, question_type=kind)

            # Keep only non-empty questions that differ from the answer.
            if generated and generated != candidate:
                cards.append({
                    "question": generated,
                    "answer": candidate,
                    "context": preview,
                    "type": kind
                })

    except Exception as e:
        print(f"Error generating QA: {e}")

    return cards

def process_pdf(pdf_file, questions_per_chunk: int = 3, max_chunks: int = 20):
    """Run the PDF-to-flashcards pipeline, yielding progress updates as it goes.

    This is a generator. Every item it yields is a 4-tuple
    ``(status, csv_text, json_text, display_markdown)``; intermediate updates
    carry empty CSV/JSON strings and a placeholder display string, and the
    final update carries the finished outputs.

    Args:
        pdf_file: The uploaded PDF (passed to ``extract_text_from_pdf``), or
            ``None`` when nothing was uploaded.
        questions_per_chunk: Maximum number of cards requested per chunk.
        max_chunks: Only the first ``max_chunks`` chunks are processed.

    Yields:
        Status tuples as described above. Failures are reported as a yielded
        status tuple instead of a raised exception.
    """
    if pdf_file is None:
        # Because this function contains ``yield`` it is a generator, so
        # ``return <value>`` would only set StopIteration.value and a caller
        # iterating the generator would never receive the message. Yield the
        # message, then stop.
        yield "Please upload a PDF file.", "", "", "Your flashcards will appear here..."
        return

    try:
        # Extract text
        yield "πŸ“„ Extracting text from PDF...", "", "", "Processing..."
        raw_text = extract_text_from_pdf(pdf_file)

        # extract_text_from_pdf reports failures as a string starting with "Error".
        if raw_text.startswith("Error"):
            yield raw_text, "", "", "Error occurred"
            return

        if len(raw_text.strip()) < 100:
            yield "PDF appears to be empty or contains no extractable text.", "", "", "Error occurred"
            return

        # Clean text
        yield "🧹 Cleaning text...", "", "", "Processing..."
        cleaned_text = clean_text(raw_text)

        # Chunk text
        yield "βœ‚οΈ Chunking text into sections...", "", "", "Processing..."
        chunks = chunk_text(cleaned_text)

        # Limit chunks for CPU performance
        chunks = chunks[:max_chunks]

        # Generate flashcards
        all_flashcards = []
        total_chunks = len(chunks)

        for i, chunk in enumerate(chunks):
            progress = f"🎴 Generating flashcards... ({i+1}/{total_chunks} chunks processed)"
            yield progress, "", "", "Processing..."

            cards = generate_qa_pairs(chunk, questions_per_chunk)
            all_flashcards.extend(cards)

        if not all_flashcards:
            yield "Could not generate flashcards from this PDF. Try a PDF with more textual content.", "", "", "No flashcards generated"
            return

        # Format output
        yield "βœ… Finalizing...", "", "", "Almost done..."

        # Create formatted display
        display_text = format_flashcards_display(all_flashcards)

        # Create JSON download
        json_output = json.dumps(all_flashcards, indent=2, ensure_ascii=False)

        # Create Anki/CSV format; embedded double quotes are doubled so each
        # quoted field stays intact.
        csv_lines = ["Question,Answer,Type"]
        for card in all_flashcards:
            q = card['question'].replace('"', '""')
            a = card['answer'].replace('"', '""')
            t = card.get('type', 'what')
            csv_lines.append(f'"{q}","{a}","{t}"')
        csv_output = "\n".join(csv_lines)

        # FINAL OUTPUT - this updates all components
        stats = f"βœ… Done! Generated {len(all_flashcards)} flashcards ("
        types_count = {}
        for card in all_flashcards:
            t = card.get('type', 'what')
            types_count[t] = types_count.get(t, 0) + 1
        stats += ", ".join([f"{count} {qtype}" for qtype, count in types_count.items()]) + ")"

        yield stats, csv_output, json_output, display_text

    except Exception as e:
        # Report the failure as a status update instead of raising.
        error_msg = f"Error processing PDF: {str(e)}"
        print(error_msg)
        yield error_msg, "", "", error_msg

def format_flashcards_display(flashcards: List[Dict]) -> str:
    """Render flashcards as a Markdown document for on-screen display.

    The output starts with a heading giving the total number of cards and a
    per-type breakdown (in first-seen order), followed by one section per
    card showing its question, its answer and a short context excerpt.

    Args:
        flashcards: Card dicts. ``question`` and ``answer`` are required.
            ``type`` is optional and falls back to ``'what'`` when missing
            or empty. ``context`` is optional; when missing or empty the
            context line is left out.

    Returns:
        The Markdown text, with lines joined by newlines.

    Raises:
        KeyError: If a card has no ``question`` or ``answer`` key.
    """
    max_context_chars = 100
    # Any type other than WHAT / WHY gets the wrench icon.
    emoji_by_type = {"WHAT": "❓", "WHY": "🤔"}
    fallback_emoji = "🔧"

    lines = [f"## 🎴 Generated {len(flashcards)} Flashcards\n"]

    # Count by type (dict insertion order keeps the first-seen ordering).
    types_count: Dict[str, int] = {}
    for card in flashcards:
        qtype = card.get('type') or 'what'
        types_count[qtype] = types_count.get(qtype, 0) + 1

    breakdown = ', '.join(
        f'{count} {qtype.upper()}' for qtype, count in types_count.items()
    )
    lines.append(f"**Breakdown:** {breakdown}\n")
    lines.append("---\n")

    for i, card in enumerate(flashcards, 1):
        qtype = (card.get('type') or 'what').upper()
        emoji = emoji_by_type.get(qtype, fallback_emoji)

        lines.append(f"### {emoji} Card {i} - {qtype}")
        lines.append(f"**Q:** {card['question']}")
        lines.append(f"**A:** {card['answer']}")

        context = card.get('context') or ''
        if context:
            # Only signal truncation when something was actually cut off.
            suffix = "..." if len(context) > max_context_chars else ""
            lines.append(f"*Context: {context[:max_context_chars]}{suffix}*\n")
        else:
            # Keep a blank line before the rule: in Markdown, text directly
            # followed by '---' would be parsed as a setext heading.
            lines.append("")
        lines.append("---\n")

    return "\n".join(lines)

def create_sample_flashcard():
    """Return the Markdown rendering of three built-in example cards.

    The examples contain one card of each question type ("what", "why" and
    "how") and are rendered with format_flashcards_display.
    """
    what_card = dict(
        question="What is photosynthesis?",
        answer="Photosynthesis is the process by which plants convert sunlight into energy.",
        context="Photosynthesis is the process by which plants convert sunlight into energy...",
        type="what",
    )
    why_card = dict(
        question="Why do plants need chlorophyll?",
        answer="Chlorophyll absorbs light energy needed for photosynthesis.",
        context="Chlorophyll absorbs light energy needed for photosynthesis...",
        type="why",
    )
    how_card = dict(
        question="How do plants convert light into chemical energy?",
        answer="Through the process of photosynthesis in the chloroplasts.",
        context="Through the process of photosynthesis in the chloroplasts...",
        type="how",
    )
    return format_flashcards_display([what_card, why_card, how_card])

# Custom CSS passed to gr.Blocks(css=...) when the interface is built.
# It defines three selectors: a gradient "card" container
# (.flashcard-container) and text styles for .question and .answer.
# NOTE(review): none of the components created in the Blocks layout below
# pass elem_classes, so these rules may not match anything - confirm they
# are actually used.
custom_css = """
.flashcard-container {
    border: 2px solid #e0e0e0;
    border-radius: 10px;
    padding: 20px;
    margin: 10px 0;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
}
.question {
    font-size: 1.2em;
    font-weight: bold;
    margin-bottom: 10px;
}
.answer {
    font-size: 1em;
    opacity: 0.9;
}
"""

# Gradio interface: upload + settings on the left, status + rendered cards on
# the right, raw CSV / JSON exports underneath, and a static example at the
# bottom.
with gr.Blocks(css=custom_css, title="PDF to Flashcards") as demo:
    gr.Markdown("""
    # 📚 PDF to Flashcards Generator (Enhanced)

    Upload any PDF document and automatically generate study flashcards with **What, Why, and How** questions using AI.

    **✨ New Features:**
    - 🎯 Generates **What** questions (factual)
    - 🤔 Generates **Why** questions (reasoning)
    - 🔧 Generates **How** questions (process)
    - 📊 Improved question quality and variety
    - 🧠 Better answer extraction

    **Core Features:**
    - 🧠 Uses local CPU-friendly AI (no GPU needed)
    - 📄 Extracts text from any PDF
    - ✂️ Intelligently chunks content
    - 🎴 Generates diverse question-answer pairs
    - 💾 Export to CSV (Anki-compatible) or JSON

    *Note: Processing is done entirely on CPU, so large PDFs may take a few minutes.*
    """)

    with gr.Row():
        # Left column: input file, generation settings and usage tips.
        with gr.Column(scale=1):
            pdf_input = gr.File(
                label="Upload PDF",
                file_types=[".pdf"],
                type="filepath"
            )

            with gr.Row():
                questions_per_chunk = gr.Slider(
                    minimum=1,
                    maximum=6,
                    value=3,
                    step=1,
                    label="Questions per section"
                )
                max_chunks = gr.Slider(
                    minimum=5,
                    maximum=50,
                    value=20,
                    step=5,
                    label="Max sections to process"
                )

            process_btn = gr.Button("🚀 Generate Flashcards", variant="primary")

            gr.Markdown("""
            ### 💡 Tips:
            - Text-based PDFs work best (scanned images won't work)
            - Academic papers and articles work great
            - Adjust "Questions per section" for more variety
            - Higher questions per section = more Why/How questions
            """)

        # Right column: progress/status line and the rendered flashcards.
        with gr.Column(scale=2):
            status_text = gr.Textbox(
                label="Status",
                value="Ready to process PDF...",
                interactive=False
            )

            output_display = gr.Markdown(
                label="Generated Flashcards",
                value="Your flashcards will appear here..."
            )

    # Export area: the same cards as copy-and-paste CSV and JSON text.
    with gr.Row():
        with gr.Column():
            csv_output = gr.Textbox(
                label="CSV Format (for Anki import)",
                lines=10,
                visible=True
            )
            gr.Markdown("*Copy the CSV content and save as `.csv` file to import into Anki*")

        with gr.Column():
            json_output = gr.Textbox(
                label="JSON Format",
                lines=10,
                visible=True
            )
            gr.Markdown("*Raw JSON data for custom applications*")

    # Direct binding. process_pdf yields 4-tuples in the order
    # (status, csv, json, display), which must match the order of `outputs`;
    # each yield refreshes all four components, so progress messages show up
    # while the PDF is still being processed.
    process_btn.click(
        fn=process_pdf,
        inputs=[pdf_input, questions_per_chunk, max_chunks],
        outputs=[status_text, csv_output, json_output, output_display]
    )

    # Example section: static sample cards rendered once when the UI is built.
    gr.Markdown("---")
    gr.Markdown("### 🎯 Example Output Format")
    gr.Markdown(create_sample_flashcard())

# Launch the Gradio app only when this file is run as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    demo.launch()