NNEngine commited on
Commit
0712e1f
·
1 Parent(s): da03326

Initial PDF evaluator app

Browse files
Files changed (2) hide show
  1. app.py +163 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from pypdf import PdfReader
4
+ from PIL import Image
5
+ import io
6
+
7
+ from transformers import (
8
+ TrOCRProcessor,
9
+ VisionEncoderDecoderModel,
10
+ AutoTokenizer,
11
+ AutoModelForCausalLM
12
+ )
13
+
14
+ # ============================================================
15
+ # Device
16
+ # ============================================================
17
+ device = "cuda" if torch.cuda.is_available() else "cpu"
18
+
19
+ # ============================================================
20
+ # Load Models (cached by HF Spaces)
21
+ # ============================================================
22
+ ocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
23
+ ocr_model = VisionEncoderDecoderModel.from_pretrained(
24
+ "microsoft/trocr-base-printed"
25
+ ).to(device)
26
+
27
+ tokenizer = AutoTokenizer.from_pretrained(
28
+ "Qwen/Qwen2.5-1.5B-Instruct",
29
+ trust_remote_code=True
30
+ )
31
+
32
+ qwen_model = AutoModelForCausalLM.from_pretrained(
33
+ "Qwen/Qwen2.5-1.5B-Instruct",
34
+ device_map="auto",
35
+ torch_dtype=torch.float16 if device == "cuda" else torch.float32,
36
+ trust_remote_code=True
37
+ )
38
+
39
+ # ============================================================
40
+ # Helpers
41
+ # ============================================================
42
+ def is_scanned_pdf(reader):
43
+ for page in reader.pages:
44
+ if page.extract_text():
45
+ return False
46
+ return True
47
+
48
+ def extract_text_from_pdf(file):
49
+ reader = PdfReader(file)
50
+ scanned = is_scanned_pdf(reader)
51
+
52
+ extracted_text = []
53
+
54
+ if not scanned:
55
+ # Digital PDF
56
+ for page in reader.pages:
57
+ text = page.extract_text()
58
+ if text:
59
+ extracted_text.append(text)
60
+
61
+ else:
62
+ # OCR only embedded images (HF-safe)
63
+ for page in reader.pages:
64
+ if "/XObject" in page["/Resources"]:
65
+ xobjects = page["/Resources"]["/XObject"].get_object()
66
+ for obj in xobjects:
67
+ xobj = xobjects[obj]
68
+ if xobj["/Subtype"] == "/Image":
69
+ image = Image.open(io.BytesIO(xobj.get_data())).convert("RGB")
70
+ pixel_values = ocr_processor(
71
+ images=image,
72
+ return_tensors="pt"
73
+ ).pixel_values.to(device)
74
+
75
+ with torch.no_grad():
76
+ ids = ocr_model.generate(pixel_values)
77
+
78
+ text = ocr_processor.batch_decode(
79
+ ids,
80
+ skip_special_tokens=True
81
+ )[0]
82
+ extracted_text.append(text)
83
+
84
+ return "\n\n".join(extracted_text)
85
+
86
+ def evaluate_text(text):
87
+ prompt = f"""
88
+ You are a strict academic evaluator.
89
+
90
+ Evaluate the following document and assign marks out of 10.
91
+
92
+ Criteria:
93
+ - Clarity
94
+ - Structure
95
+ - Technical depth
96
+ - Language quality
97
+ - Completeness
98
+
99
+ DOCUMENT:
100
+ ---------
101
+ {text[:6000]}
102
+ ---------
103
+
104
+ Respond strictly in this format:
105
+
106
+ Score: X/10
107
+ Justification:
108
+ Strengths:
109
+ Weaknesses:
110
+ """
111
+
112
+ inputs = tokenizer(prompt, return_tensors="pt").to(device)
113
+
114
+ with torch.no_grad():
115
+ output = qwen_model.generate(
116
+ **inputs,
117
+ max_new_tokens=400,
118
+ do_sample=False
119
+ )
120
+
121
+ return tokenizer.decode(output[0], skip_special_tokens=True)
122
+
123
+ # ============================================================
124
+ # Gradio Function
125
+ # ============================================================
126
+ def process_pdf(pdf_file):
127
+ extracted_text = extract_text_from_pdf(pdf_file)
128
+ evaluation = evaluate_text(extracted_text)
129
+
130
+ return extracted_text, evaluation
131
+
132
+ # ============================================================
133
+ # Gradio UI
134
+ # ============================================================
135
+ with gr.Blocks(title="PDF Evaluator (OCR + Qwen)") as demo:
136
+ gr.Markdown("""
137
+ # 📄 PDF Evaluator
138
+ Upload a PDF to:
139
+ - Extract text (OCR if needed)
140
+ - Evaluate content using Qwen
141
+ - Get marks out of 10
142
+ """)
143
+
144
+ pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
145
+ extract_btn = gr.Button("Extract & Evaluate")
146
+
147
+ extracted_output = gr.Textbox(
148
+ label="Extracted Text",
149
+ lines=20
150
+ )
151
+
152
+ evaluation_output = gr.Textbox(
153
+ label="Evaluation",
154
+ lines=10
155
+ )
156
+
157
+ extract_btn.click(
158
+ process_pdf,
159
+ inputs=pdf_input,
160
+ outputs=[extracted_output, evaluation_output]
161
+ )
162
+
163
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ accelerate
4
+ pypdf
5
+ pillow
6
+ gradio
7
+ sentencepiece