prat1003 committed on
Commit
719feee
·
verified ·
1 Parent(s): 6577daa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -26
app.py CHANGED
@@ -1,33 +1,53 @@
1
  import gradio as gr
2
- from PyPDF2 import PdfReader
 
3
  from transformers import pipeline
4
  import json
 
 
5
 
6
- def generate_questions(pdf_file):
7
- # Step 1: Extract text
8
- reader = PdfReader(pdf_file.name)
 
 
 
9
  text = ""
10
- for page in reader.pages:
11
- text += page.extract_text() + "\n"
12
-
13
- # Step 2: Hugging Face QG model
14
- qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-small-qg-prepend")
15
- questions = qg_pipeline(f"generate questions: {text[:2000]}") # Demo: first 2000 chars
16
-
17
- # Step 3: Build XML
18
- generated_questions = []
19
- for q in questions:
20
- generated_questions.append({
21
- "questiontext": q['generated_text'],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  "questiontype": "single_select",
23
  "marks": 10,
24
  "options": [
25
- {"optiontext": "Answer1", "score": "10"},
26
- {"optiontext": "Answer2", "score": "0"}
27
  ]
28
  })
29
 
30
- questiondata_json = json.dumps({
31
  "title": "Certification Title",
32
  "totalmarks": "50",
33
  "time": "20",
@@ -35,18 +55,20 @@ def generate_questions(pdf_file):
35
  "failurl": "",
36
  "passurl": "",
37
  "sendpassemail": True,
38
- "questions": json.dumps({"questions": generated_questions}),
39
  "maxattempts": 3
40
- })
41
 
42
- xml_output = f'<questiondata><![CDATA[{questiondata_json}]]></questiondata>'
43
  return xml_output
44
 
45
- # Gradio UI
46
  iface = gr.Interface(
47
- fn=generate_questions,
48
- inputs=gr.File(file_types=['.pdf']),
49
- outputs=gr.Textbox(label="Generated XML")
 
 
50
  )
51
 
52
  iface.launch()
 
1
  import gradio as gr
2
+ from pdf2image import convert_from_path
3
+ import pytesseract
4
  from transformers import pipeline
5
  import json
6
+ import tempfile
7
+ import os
8
 
9
+ # Load question generation model
10
+ qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-small-qg-prepend")
11
+
12
+ # OCR function
13
def extract_text_from_scanned_pdf(file_path):
    """Run OCR over every page of the PDF at *file_path* and return the text.

    The PDF is turned into a sequence of page images with
    ``convert_from_path``; each image is passed to
    ``pytesseract.image_to_string``. The per-page results are concatenated
    in page order with nothing inserted between them, and leading/trailing
    whitespace is stripped from the combined string.
    """
    page_images = convert_from_path(file_path)
    # Collect the OCR output per page, then join once instead of growing
    # a string page by page.
    ocr_chunks = [pytesseract.image_to_string(image) for image in page_images]
    return "".join(ocr_chunks).strip()
19
+
20
+ # Main function
21
+ def process_pdf(pdf_file):
22
+ # Step 1: Save uploaded PDF
23
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
24
+ temp_pdf.write(pdf_file.read())
25
+ temp_pdf_path = temp_pdf.name
26
+
27
+ # Step 2: OCR extraction
28
+ extracted_text = extract_text_from_scanned_pdf(temp_pdf_path)
29
+ os.remove(temp_pdf_path)
30
+
31
+ if not extracted_text.strip():
32
+ return "❌ Could not extract text. Make sure the PDF has readable content."
33
+
34
+ # Step 3: Generate questions
35
+ questions_output = qg_pipeline("generate questions: " + extracted_text[:1000], max_length=128, num_return_sequences=3)
36
+
37
+ # Step 4: Convert to <questiondata> XML
38
+ question_list = []
39
+ for q in questions_output:
40
+ question_list.append({
41
+ "questiontext": q["generated_text"],
42
  "questiontype": "single_select",
43
  "marks": 10,
44
  "options": [
45
+ {"optiontext": "Option 1", "score": "10"},
46
+ {"optiontext": "Option 2", "score": "0"}
47
  ]
48
  })
49
 
50
+ data = {
51
  "title": "Certification Title",
52
  "totalmarks": "50",
53
  "time": "20",
 
55
  "failurl": "",
56
  "passurl": "",
57
  "sendpassemail": True,
58
+ "questions": json.dumps({"questions": question_list}),
59
  "maxattempts": 3
60
+ }
61
 
62
+ xml_output = "<questiondata><![CDATA[" + json.dumps(data) + "]]></questiondata>"
63
  return xml_output
64
 
65
# Build the Gradio UI: a single file upload as input, and the string
# returned by process_pdf shown as plain text.
iface = gr.Interface(
    title="📄 PDF to Question Generator (with OCR)",
    description="Uploads a scanned PDF, runs OCR, and generates <questiondata> XML output for your quiz system.",
    fn=process_pdf,
    inputs=gr.File(label="Upload your scanned PDF"),
    outputs="text",
)

# Start serving the interface.
iface.launch()