prat1003 committed on
Commit
9153df0
·
verified ·
1 Parent(s): bc99bb8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -15
app.py CHANGED
@@ -4,12 +4,17 @@ import pytesseract
4
  from transformers import pipeline
5
  import json
6
  import tempfile
 
7
  import os
8
 
9
- # Load question generation model
10
- qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-small-qg-prepend", tokenizer="t5-small")
 
 
 
 
11
 
12
- # OCR function
13
  def extract_text_from_scanned_pdf(file_path):
14
  pages = convert_from_path(file_path)
15
  text = ""
@@ -17,24 +22,25 @@ def extract_text_from_scanned_pdf(file_path):
17
  text += pytesseract.image_to_string(page)
18
  return text.strip()
19
 
20
- # Main function
21
  def process_pdf(pdf_file):
22
- # Step 1: Save uploaded PDF
23
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
24
- temp_pdf.write(pdf_file.read())
25
  temp_pdf_path = temp_pdf.name
26
 
27
- # Step 2: OCR extraction
28
  extracted_text = extract_text_from_scanned_pdf(temp_pdf_path)
29
  os.remove(temp_pdf_path)
30
 
31
  if not extracted_text.strip():
32
- return "❌ Could not extract text. Make sure the PDF has readable content."
33
 
34
- # Step 3: Generate questions
35
- questions_output = qg_pipeline("generate questions: " + extracted_text[:1000], max_length=128, num_return_sequences=3)
 
36
 
37
- # Step 4: Convert to <questiondata> XML
38
  question_list = []
39
  for q in questions_output:
40
  question_list.append({
@@ -47,6 +53,7 @@ def process_pdf(pdf_file):
47
  ]
48
  })
49
 
 
50
  data = {
51
  "title": "Certification Title",
52
  "totalmarks": "50",
@@ -59,16 +66,17 @@ def process_pdf(pdf_file):
59
  "maxattempts": 3
60
  }
61
 
 
62
  xml_output = "<questiondata><![CDATA[" + json.dumps(data) + "]]></questiondata>"
63
  return xml_output
64
 
65
- # Gradio interface
66
  iface = gr.Interface(
67
  fn=process_pdf,
68
- inputs=gr.File(label="Upload your scanned PDF"),
69
  outputs="text",
70
- title="📄 PDF to Question Generator (with OCR)",
71
- description="Uploads a scanned PDF, runs OCR, and generates <questiondata> XML output for your quiz system."
72
  )
73
 
74
  iface.launch()
 
4
  from transformers import pipeline
5
  import json
6
  import tempfile
7
+ import shutil
8
  import os
9
 
10
+ # 🧠 Load lightweight question generation model
11
+ qg_pipeline = pipeline(
12
+ "text2text-generation",
13
+ model="valhalla/t5-small-qg-prepend",
14
+ tokenizer="t5-small"
15
+ )
16
 
17
+ # 🧩 OCR function: extract text from scanned PDFs
18
  def extract_text_from_scanned_pdf(file_path):
19
  pages = convert_from_path(file_path)
20
  text = ""
 
22
  text += pytesseract.image_to_string(page)
23
  return text.strip()
24
 
25
+ # ⚙️ Main processing function
26
  def process_pdf(pdf_file):
27
+ # Step 1️⃣: Copy uploaded file to a temporary location
28
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
29
+ shutil.copy(pdf_file.name, temp_pdf.name)
30
  temp_pdf_path = temp_pdf.name
31
 
32
+ # Step 2️⃣: Extract text using OCR
33
  extracted_text = extract_text_from_scanned_pdf(temp_pdf_path)
34
  os.remove(temp_pdf_path)
35
 
36
  if not extracted_text.strip():
37
+ return "❌ Could not extract text. Make sure the PDF has readable text."
38
 
39
+ # Step 3️⃣: Generate questions from extracted text
40
+ prompt = "generate questions: " + extracted_text[:1000] # limit to 1000 chars
41
+ questions_output = qg_pipeline(prompt, max_length=128, num_return_sequences=3)
42
 
43
+ # Step 4️⃣: Convert model output into question list
44
  question_list = []
45
  for q in questions_output:
46
  question_list.append({
 
53
  ]
54
  })
55
 
56
+ # Step 5️⃣: Build the <questiondata> structure
57
  data = {
58
  "title": "Certification Title",
59
  "totalmarks": "50",
 
66
  "maxattempts": 3
67
  }
68
 
69
+ # Step 6️⃣: Wrap JSON inside XML CDATA
70
  xml_output = "<questiondata><![CDATA[" + json.dumps(data) + "]]></questiondata>"
71
  return xml_output
72
 
73
+ # 🚀 Gradio Web UI
74
  iface = gr.Interface(
75
  fn=process_pdf,
76
+ inputs=gr.File(label="📄 Upload your scanned PDF"),
77
  outputs="text",
78
+ title="PDF to Question Generator (with OCR)",
79
+ description="Uploads a scanned PDF, extracts text via OCR, and generates <questiondata> XML for quiz integration."
80
  )
81
 
82
  iface.launch()