prat1003 committed on
Commit
2085bbf
·
verified ·
1 Parent(s): e7b5f58

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -41
app.py CHANGED
@@ -1,20 +1,43 @@
1
  import gradio as gr
2
- from pdf2image import convert_from_path
3
- import pytesseract
4
- from transformers import pipeline
5
- import json
6
  import tempfile
7
  import shutil
8
  import os
9
-
10
-
11
- import easyocr
12
  import numpy as np
 
 
 
 
13
 
 
 
 
14
  reader = easyocr.Reader(['en'])
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  def extract_text_from_scanned_pdf(file_path):
17
- pages = convert_from_path(file_path)
 
18
  text = ""
19
  for page in pages:
20
  img_array = np.array(page)
@@ -22,43 +45,37 @@ def extract_text_from_scanned_pdf(file_path):
22
  text += " ".join(result) + "\n"
23
  return text.strip()
24
 
25
-
26
-
27
-
28
- # 🧠 Load lightweight question generation model
29
- qg_pipeline = pipeline(
30
- "text2text-generation",
31
- model="valhalla/t5-small-qg-prepend",
32
- tokenizer="t5-small"
33
- )
34
-
35
- # 🧩 OCR function: extract text from scanned PDFs
36
- #def extract_text_from_scanned_pdf(file_path):
37
- # pages = convert_from_path(file_path)
38
- # text = ""
39
- # for page in pages:
40
- # text += pytesseract.image_to_string(page)
41
- # return text.strip()
42
-
43
- # ⚙️ Main processing function
44
  def process_pdf(pdf_file):
45
- # Step 1️⃣: Copy uploaded file to a temporary location
46
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
47
  shutil.copy(pdf_file.name, temp_pdf.name)
48
  temp_pdf_path = temp_pdf.name
49
 
50
- # Step 2️⃣: Extract text using OCR
51
- extracted_text = extract_text_from_scanned_pdf(temp_pdf_path)
52
- os.remove(temp_pdf_path)
53
 
 
54
  if not extracted_text.strip():
55
- return "❌ Could not extract text. Make sure the PDF has readable text."
56
 
57
- # Step 3️⃣: Generate questions from extracted text
58
- prompt = "generate questions: " + extracted_text[:1000] # limit to 1000 chars
59
- questions_output = qg_pipeline(prompt, max_length=128, num_return_sequences=3)
60
 
61
- # Step 4️⃣: Convert model output into question list
 
 
 
 
 
 
 
 
 
 
 
 
62
  question_list = []
63
  for q in questions_output:
64
  question_list.append({
@@ -71,7 +88,7 @@ def process_pdf(pdf_file):
71
  ]
72
  })
73
 
74
- # Step 5️⃣: Build the <questiondata> structure
75
  data = {
76
  "title": "Certification Title",
77
  "totalmarks": "50",
@@ -84,17 +101,19 @@ def process_pdf(pdf_file):
84
  "maxattempts": 3
85
  }
86
 
87
- # Step 6️⃣: Wrap JSON inside XML CDATA
88
  xml_output = "<questiondata><![CDATA[" + json.dumps(data) + "]]></questiondata>"
89
  return xml_output
90
 
91
- # 🚀 Gradio Web UI
 
 
92
  iface = gr.Interface(
93
  fn=process_pdf,
94
- inputs=gr.File(label="📄 Upload your scanned PDF"),
95
  outputs="text",
96
  title="PDF to Question Generator (with OCR)",
97
- description="Uploads a scanned PDF, extracts text via OCR, and generates <questiondata> XML for quiz integration."
98
  )
99
 
100
  iface.launch()
 
1
  import gradio as gr
 
 
 
 
2
  import tempfile
3
  import shutil
4
  import os
5
+ import json
 
 
6
  import numpy as np
7
+ from pdf2image import convert_from_path
8
+ import easyocr
9
+ from PyPDF2 import PdfReader
10
+ from transformers import pipeline
11
 
12
+ # -----------------------------
13
+ # Initialize OCR and Transformers
14
+ # -----------------------------
15
  reader = easyocr.Reader(['en'])
16
 
17
+ qg_pipeline = pipeline(
18
+ "text2text-generation",
19
+ model="valhalla/t5-small-qg-prepend",
20
+ tokenizer="t5-small"
21
+ )
22
+
23
+ # -----------------------------
24
+ # Extract text from selectable PDFs
25
+ # -----------------------------
26
+ def extract_text_from_pdf(file_path):
27
+ reader_pdf = PdfReader(file_path)
28
+ text = ""
29
+ for page in reader_pdf.pages:
30
+ t = page.extract_text()
31
+ if t:
32
+ text += t + "\n"
33
+ return text.strip()
34
+
35
+ # -----------------------------
36
+ # Extract text from scanned PDFs using EasyOCR
37
+ # -----------------------------
38
  def extract_text_from_scanned_pdf(file_path):
39
+ # Reduce DPI for faster processing
40
+ pages = convert_from_path(file_path, dpi=150)
41
  text = ""
42
  for page in pages:
43
  img_array = np.array(page)
 
45
  text += " ".join(result) + "\n"
46
  return text.strip()
47
 
48
+ # -----------------------------
49
+ # Main processing function
50
+ # -----------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  def process_pdf(pdf_file):
52
+ # Save uploaded PDF to temp file
53
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
54
  shutil.copy(pdf_file.name, temp_pdf.name)
55
  temp_pdf_path = temp_pdf.name
56
 
57
+ # Step 1: Try extracting text from PDF directly
58
+ extracted_text = extract_text_from_pdf(temp_pdf_path)
 
59
 
60
+ # Step 2: If empty, use OCR
61
  if not extracted_text.strip():
62
+ extracted_text = extract_text_from_scanned_pdf(temp_pdf_path)
63
 
64
+ os.remove(temp_pdf_path)
 
 
65
 
66
+ if not extracted_text.strip():
67
+ return "❌ Could not extract text. Make sure the PDF has readable content."
68
+
69
+ # Step 3: Generate questions with beam search (3 questions)
70
+ prompt = "generate questions: " + extracted_text[:1000] # limit to first 1000 chars
71
+ questions_output = qg_pipeline(
72
+ prompt,
73
+ max_length=128,
74
+ num_beams=3, # beam search
75
+ num_return_sequences=3
76
+ )
77
+
78
+ # Step 4: Build question list
79
  question_list = []
80
  for q in questions_output:
81
  question_list.append({
 
88
  ]
89
  })
90
 
91
+ # Step 5: Build <questiondata> structure
92
  data = {
93
  "title": "Certification Title",
94
  "totalmarks": "50",
 
101
  "maxattempts": 3
102
  }
103
 
104
+ # Step 6: Wrap JSON in XML CDATA
105
  xml_output = "<questiondata><![CDATA[" + json.dumps(data) + "]]></questiondata>"
106
  return xml_output
107
 
108
+ # -----------------------------
109
+ # Gradio Interface
110
+ # -----------------------------
111
  iface = gr.Interface(
112
  fn=process_pdf,
113
+ inputs=gr.File(label="📄 Upload your PDF"),
114
  outputs="text",
115
  title="PDF to Question Generator (with OCR)",
116
+ description="Uploads a PDF, extracts text (or OCR for scanned PDFs), and generates <questiondata> XML for quizzes."
117
  )
118
 
119
  iface.launch()