c-ho committed on
Commit
47acc6b
·
verified ·
1 Parent(s): 2b812f0

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +274 -0
app.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =========================================================
2
+ # app.py (FINAL OCR + CLASSIFICATION PIPELINE)
3
+ # =========================================================
4
+
5
+ import gradio as gr
6
+ from transformers import pipeline
7
+ from pypdf import PdfReader
8
+ from pdf2image import convert_from_path
9
+ import pytesseract
10
+ import tempfile
11
+
12
+ # =========================================================
13
+ # Available Models
14
+ # =========================================================
15
+
16
# Hugging Face model ids of the two available line classifiers.
_ENGLISH_MODEL_ID = "ubffm/academic_text_classifier_en"
_GERMAN_MODEL_ID = "ubffm/academic_text_classifier_de"

# Maps the name shown in the UI dropdowns to the model id that is loaded.
MODELS = {
    f"English model ({_ENGLISH_MODEL_ID})": _ENGLISH_MODEL_ID,
    f"German model ({_GERMAN_MODEL_ID})": _GERMAN_MODEL_ID,
}

# The dropdowns start on the first entry of MODELS (the English model).
DEFAULT_MODEL = next(iter(MODELS))

# =========================================================
# Example Text
# =========================================================

# Sample input pre-filled into the text tab.
EXAMPLE_TEXT = """
Microsoft Word - 08-Zimmermann-ISIS6-final.doc
Contrastive Focus
Malte Zimmermann
Humboldt University
The article puts forward a discourse-pragmatic approach...
"""

# =========================================================
# Labels
# =========================================================

# Labels offered in the noise-selection checkbox groups.
LABELS = ["OUT OF SCOPE", "MAIN TEXT", "EXAMPLE", "REFERENCE"]

# Labels that are ticked (i.e. filtered out) by default.
DEFAULT_NOISE = ["OUT OF SCOPE", "REFERENCE"]
47
+
48
+ # =========================================================
49
+ # Pipeline cache
50
+ # =========================================================
51
+
52
# Loaded pipelines, keyed by Hugging Face model id, so a model that has
# already been requested is reused instead of being loaded again.
PIPELINES = {}


def get_classifier(model_display_name):
    """Return the text-classification pipeline for a UI model choice.

    The pipeline is created on first use and stored in ``PIPELINES``;
    later calls for the same model return the stored instance.

    Args:
        model_display_name: A key of ``MODELS`` (the name shown in the
            model dropdowns).

    Returns:
        The ``transformers`` pipeline for the selected model, configured
        to report a score for every label.

    Raises:
        KeyError: If ``model_display_name`` is not a key of ``MODELS``.
    """
    model_name = MODELS[model_display_name]

    if model_name not in PIPELINES:
        PIPELINES[model_name] = pipeline(
            "text-classification",
            model=model_name,
            tokenizer=model_name,
            # `return_all_scores=True` is deprecated in transformers;
            # `top_k=None` is the documented way to get all label scores.
            top_k=None,
        )

    return PIPELINES[model_name]
66
+
67
+ # =========================================================
68
+ # Prediction helper
69
+ # =========================================================
70
+
71
def get_best_prediction(classifier, text):
    """Classify ``text`` and pick the highest-scoring label.

    Args:
        classifier: Callable invoked as ``classifier(text, truncation=True)``.
            It must return a list of ``{"label": ..., "score": ...}``
            dicts, optionally wrapped in one outer list.
        text: The string to classify.

    Returns:
        A ``(best, result)`` tuple: ``best`` is the entry with the largest
        ``"score"``; ``result`` is the (unwrapped) list of all entries.

    Raises:
        ValueError: If the classifier returned no predictions.
    """
    # Ask the tokenizer to truncate: without it, an input longer than the
    # model's maximum sequence length makes the forward pass fail.
    result = classifier(text, truncation=True)

    # Some pipeline configurations wrap the per-label scores in an extra
    # list (one entry per input); unwrap it so both shapes are accepted.
    if isinstance(result, list) and result and isinstance(result[0], list):
        result = result[0]

    if not result:
        raise ValueError("classifier returned no predictions")

    best = max(result, key=lambda entry: entry["score"])
    return best, result
80
+
81
+ # =========================================================
82
+ # Clean empty lines
83
+ # =========================================================
84
+
85
def normalize_empty_lines(lines):
    """Collapse every run of consecutive blank lines into a single one.

    A line counts as blank when it is empty or whitespace-only. The first
    blank line of a run is kept as-is; the following ones are dropped.
    Non-blank lines are returned unchanged and in order.
    """
    result = []

    for current in lines:
        is_blank = not current.strip()
        # The last kept line being blank means we are inside a blank run.
        follows_blank = bool(result) and not result[-1].strip()
        if not (is_blank and follows_blank):
            result.append(current)

    return result
97
+
98
+ # =========================================================
99
+ # TEXT processing
100
+ # =========================================================
101
+
102
def process_text_input(text, noise_labels, selected_model):
    """Classify ``text`` line by line and drop the lines labelled as noise.

    Args:
        text: The raw input text; ``None`` is treated like empty text.
        noise_labels: Labels whose lines are removed; ``None`` is treated
            as "remove nothing".
        selected_model: A key of ``MODELS`` choosing the classifier.

    Returns:
        A 4-tuple ``(log, filtered, stats, path)``:
        the per-line classification log, the filtered text, a short
        statistics summary, and the path of a ``.txt`` file containing the
        filtered text. For empty/blank input it is ``("", "", "", None)``.
    """
    if text is None or not text.strip():
        return "", "", "", None

    # Normalise once: tolerates None and gives O(1) membership tests.
    noise = set(noise_labels or ())

    classifier = get_classifier(selected_model)
    lines = text.splitlines()

    kept, removed, logs = [], [], []

    for i, line in enumerate(lines, 1):

        # Blank lines are never classified; they are kept (as "") so the
        # paragraph structure survives, and collapsed further below.
        if not line.strip():
            kept.append("")
            continue

        pred, _ = get_best_prediction(classifier, line)

        logs.append(f"Line {i} | {pred['label']} ({pred['score']:.4f})\n{line}\n")

        if pred["label"] in noise:
            removed.append(line)
        else:
            kept.append(line)

    # Removing lines can leave several blank lines in a row.
    kept = normalize_empty_lines(kept)
    filtered = "\n".join(kept)

    # delete=False: the file must outlive this function so it can be
    # offered for download. The `with` closes the handle even if the
    # write fails.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".txt", mode="w", encoding="utf-8"
    ) as tmp:
        tmp.write(filtered)

    stats = (
        f"Model: {selected_model}\n"
        f"Total lines: {len(lines)}\n"
        f"Removed: {len(removed)}\n"
        f"Remaining: {len(kept)}"
    )

    return "\n".join(logs), filtered, stats, tmp.name
142
+
143
+ # =========================================================
144
+ # TXT file processing
145
+ # =========================================================
146
+
147
def process_document_file(file, noise_labels, selected_model):
    """Read an uploaded text file and run it through ``process_text_input``.

    Args:
        file: The uploaded file, either a path string or an object whose
            ``.name`` attribute holds the path; ``None`` when nothing was
            uploaded.
        noise_labels: Labels whose lines are removed.
        selected_model: A key of ``MODELS`` choosing the classifier.

    Returns:
        The 4-tuple produced by ``process_text_input``, or
        ``("", "", "", None)`` when ``file`` is ``None``.
    """
    if file is None:
        return "", "", "", None

    # Accept a plain path as well as an upload object exposing `.name`.
    path = file if isinstance(file, str) else file.name

    # "utf-8-sig" reads ordinary UTF-8 and additionally strips a leading
    # BOM, which would otherwise end up inside the first line. Undecodable
    # bytes are replaced rather than aborting the whole upload.
    with open(path, "r", encoding="utf-8-sig", errors="replace") as handle:
        text = handle.read()

    return process_text_input(text, noise_labels, selected_model)
156
+
157
+ # =========================================================
158
+ # PDF OCR + extraction
159
+ # =========================================================
160
+
161
def extract_text_from_pdf(pdf_file):
    """Extract the text of a PDF, falling back to OCR when it has none.

    First the embedded text layer is read with ``PdfReader``. If that
    yields no text (or fails before yielding any), every page is rendered
    to an image at 300 dpi and recognised with ``pytesseract``.

    Args:
        pdf_file: The PDF, either a path string or an object whose
            ``.name`` attribute holds the path.

    Returns:
        The extracted text, pages joined by newlines.
    """
    import logging  # local import keeps this block self-contained

    # Accept a plain path as well as an upload object exposing `.name`.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    text_parts = []

    # 1. Try digital PDF extraction
    try:
        reader = PdfReader(pdf_path)

        for page in reader.pages:
            txt = page.extract_text()
            if txt and txt.strip():
                text_parts.append(txt)

    except Exception:
        # Deliberate best effort: a PDF the text extractor cannot handle
        # should still get its chance with OCR below. Unlike a bare
        # `except:`, this lets KeyboardInterrupt/SystemExit propagate,
        # and the failure is logged instead of vanishing.
        logging.getLogger(__name__).warning(
            "Text-layer extraction failed for %s", pdf_path, exc_info=True
        )

    text = "\n".join(text_parts).strip()

    # 2. If empty → OCR fallback
    if not text:
        pages = convert_from_path(pdf_path, dpi=300)
        text = "\n".join(pytesseract.image_to_string(page) for page in pages)

    return text
191
+
192
+ # =========================================================
193
+ # PDF processing
194
+ # =========================================================
195
+
196
def process_pdf_file(file, noise_labels, selected_model):
    """Extract the text of an uploaded PDF and filter it line by line.

    Returns the 4-tuple produced by ``process_text_input`` for the text
    obtained from ``extract_text_from_pdf``, or ``("", "", "", None)``
    when no file was supplied.
    """
    if file is not None:
        extracted = extract_text_from_pdf(file)
        return process_text_input(extracted, noise_labels, selected_model)

    return "", "", "", None
204
+
205
+ # =========================================================
206
+ # UI
207
+ # =========================================================
208
+
209
# Introductory text rendered at the top of the page.
_INTRO_MARKDOWN = """
# Academic Text Noise Filter (OCR + ML)

- PDF OCR (scanned + digital)
- TXT processing
- Line classification
- Noise filtering
- Export cleaned text
"""


def _build_tab(tab_title, make_source, handler, button_text="Process"):
    """Create one tab: model picker, input, noise labels, button, outputs.

    All three tabs share this layout and differ only in the input
    component and the handler, so they are built from one place.

    Args:
        tab_title: Title shown on the tab.
        make_source: Zero-argument callable creating the tab's input
            component; called inside the tab so it is placed there.
        handler: Function called on click as
            ``handler(source_value, noise_labels, selected_model)``; it
            must return the four output values.
        button_text: Caption of the button that triggers ``handler``.
    """
    with gr.Tab(tab_title):
        model = gr.Dropdown(list(MODELS.keys()), value=DEFAULT_MODEL, label="Model")
        source = make_source()
        noise = gr.CheckboxGroup(LABELS, value=DEFAULT_NOISE, label="Labels to remove")

        button = gr.Button(button_text)

        # Order matches the handlers' return value:
        # (log, filtered text, statistics, path of the exported file).
        outputs = [
            gr.Textbox(lines=15, label="Classification log"),
            gr.Textbox(lines=15, label="Filtered text"),
            gr.Textbox(label="Statistics"),
            gr.File(label="Filtered text file"),
        ]

        button.click(handler, [source, noise, model], outputs)


with gr.Blocks(title="Academic Text Noise Filter") as demo:

    gr.Markdown(_INTRO_MARKDOWN)

    # ---------------- TEXT ----------------
    _build_tab(
        "Text",
        lambda: gr.Textbox(lines=20, value=EXAMPLE_TEXT, label="Input text"),
        process_text_input,
    )

    # ---------------- TXT ----------------
    _build_tab(
        "TXT File",
        lambda: gr.File(file_types=[".txt"], label="TXT file"),
        process_document_file,
    )

    # ---------------- PDF ----------------
    _build_tab(
        "PDF (OCR + Text)",
        lambda: gr.File(file_types=[".pdf"], label="PDF file"),
        process_pdf_file,
        button_text="Process PDF",
    )

# =========================================================
# Launch
# =========================================================

# Start the web server only when run as a script, not on import.
if __name__ == "__main__":
    demo.launch()