Spaces:

c-ho
/

test_text_filter

Sleeping

App Files Files Community

c-ho committed 7 days ago

Commit

47acc6b

verified ·

1 Parent(s): 2b812f0

Create app.py

Browse files

Files changed (1) hide show

app.py +274 -0

app.py ADDED Viewed

	@@ -0,0 +1,274 @@

+# =========================================================
+# app.py (FINAL OCR + CLASSIFICATION PIPELINE)
+# =========================================================
+import gradio as gr
+from transformers import pipeline
+from pypdf import PdfReader
+from pdf2image import convert_from_path
+import pytesseract
+import tempfile
+# =========================================================
+# Available Models
+# =========================================================
+MODELS = {
+    "English model (ubffm/academic_text_classifier_en)": "ubffm/academic_text_classifier_en",
+    "German model (ubffm/academic_text_classifier_de)": "ubffm/academic_text_classifier_de",
+}
+DEFAULT_MODEL = "English model (ubffm/academic_text_classifier_en)"
+# =========================================================
+# Example Text
+# =========================================================
+# Sample input used as the initial value of the textbox in the "Text" tab.
+# The string starts and ends with a newline because of the triple-quote layout.
+EXAMPLE_TEXT = """
+Microsoft Word - 08-Zimmermann-ISIS6-final.doc
+Contrastive Focus
+Malte Zimmermann
+Humboldt University
+The article puts forward a discourse-pragmatic approach...
+"""
+# =========================================================
+# Labels
+# =========================================================
+LABELS = [
+    "OUT OF SCOPE",
+    "MAIN TEXT",
+    "EXAMPLE",
+    "REFERENCE"
+]
+DEFAULT_NOISE = ["OUT OF SCOPE", "REFERENCE"]
+# =========================================================
+# Pipeline cache
+# =========================================================
+PIPELINES = {}
+def get_classifier(model_display_name):
+    model_name = MODELS[model_display_name]
+    if model_name not in PIPELINES:
+        PIPELINES[model_name] = pipeline(
+            "text-classification",
+            model=model_name,
+            tokenizer=model_name,
+            return_all_scores=True
+        )
+    return PIPELINES[model_name]
+# =========================================================
+# Prediction helper
+# =========================================================
+def get_best_prediction(classifier, text):
+    result = classifier(text)
+    if isinstance(result, list) and len(result) > 0:
+        if isinstance(result[0], list):
+            result = result[0]
+    best = max(result, key=lambda x: x["score"])
+    return best, result
+# =========================================================
+# Clean empty lines
+# =========================================================
+def normalize_empty_lines(lines):
+    cleaned = []
+    prev_empty = False
+    for line in lines:
+        empty = not line.strip()
+        if empty and prev_empty:
+            continue
+        cleaned.append(line)
+        prev_empty = empty
+    return cleaned
+# =========================================================
+# TEXT processing
+# =========================================================
+def process_text_input(text, noise_labels, selected_model):
+    if not text.strip():
+        return "", "", "", None
+    classifier = get_classifier(selected_model)
+    lines = text.splitlines()
+    kept, removed, logs = [], [], []
+    for i, line in enumerate(lines, 1):
+        if not line.strip():
+            kept.append("")
+            continue
+        pred, _ = get_best_prediction(classifier, line)
+        logs.append(f"Line {i} | {pred['label']} ({pred['score']:.4f})\n{line}\n")
+        if pred["label"] in noise_labels:
+            removed.append(line)
+        else:
+            kept.append(line)
+    kept = normalize_empty_lines(kept)
+    filtered = "\n".join(kept)
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w", encoding="utf-8")
+    tmp.write(filtered)
+    tmp.close()
+    stats = (
+        f"Model: {selected_model}\n"
+        f"Total lines: {len(lines)}\n"
+        f"Removed: {len(removed)}\n"
+        f"Remaining: {len(kept)}"
+    )
+    return "\n".join(logs), filtered, stats, tmp.name
+# =========================================================
+# TXT file processing
+# =========================================================
+def process_document_file(file, noise_labels, selected_model):
+    if file is None:
+        return "", "", "", None
+    with open(file.name, "r", encoding="utf-8") as f:
+        text = f.read()
+    return process_text_input(text, noise_labels, selected_model)
+# =========================================================
+# PDF OCR + extraction
+# =========================================================
+def extract_text_from_pdf(pdf_file):
+    text_parts = []
+    # 1. Try digital PDF extraction
+    try:
+        reader = PdfReader(pdf_file.name)
+        for page in reader.pages:
+            txt = page.extract_text()
+            if txt and txt.strip():
+                text_parts.append(txt)
+    except:
+        pass
+    text = "\n".join(text_parts).strip()
+    # 2. If empty → OCR fallback
+    if not text:
+        pages = convert_from_path(pdf_file.name, dpi=300)
+        ocr_text = []
+        for page in pages:
+            ocr_text.append(pytesseract.image_to_string(page))
+        text = "\n".join(ocr_text)
+    return text
+# =========================================================
+# PDF processing
+# =========================================================
+def process_pdf_file(file, noise_labels, selected_model):
+    if file is None:
+        return "", "", "", None
+    text = extract_text_from_pdf(file)
+    return process_text_input(text, noise_labels, selected_model)
+# =========================================================
+# UI
+# =========================================================
+# Three tabs with the same control layout: model dropdown, input, noise-label
+# checkboxes, button, and four outputs. The short names (m, t, f, n, btn,
+# out1..out4) are rebound in every tab; each click handler is wired before
+# the names are reused, so every tab keeps its own components.
+with gr.Blocks(title="Academic Text Noise Filter") as demo:
+    gr.Markdown("""
+    # Academic Text Noise Filter (OCR + ML)
+    - PDF OCR (scanned + digital)
+    - TXT processing
+    - Line classification
+    - Noise filtering
+    - Export cleaned text
+    """)
+    # ---------------- TEXT ----------------
+    with gr.Tab("Text"):
+        m = gr.Dropdown(list(MODELS.keys()), value=DEFAULT_MODEL)
+        t = gr.Textbox(lines=20, value=EXAMPLE_TEXT)
+        n = gr.CheckboxGroup(LABELS, value=DEFAULT_NOISE)
+        btn = gr.Button("Process")
+        # Outputs in handler return order: log, filtered text, stats, file.
+        out1 = gr.Textbox(lines=15)
+        out2 = gr.Textbox(lines=15)
+        out3 = gr.Textbox()
+        out4 = gr.File()
+        # Input order matches the handler signature (text, noise_labels, selected_model).
+        btn.click(process_text_input, [t, n, m], [out1, out2, out3, out4])
+    # ---------------- TXT ----------------
+    with gr.Tab("TXT File"):
+        m = gr.Dropdown(list(MODELS.keys()), value=DEFAULT_MODEL)
+        f = gr.File(file_types=[".txt"])
+        n = gr.CheckboxGroup(LABELS, value=DEFAULT_NOISE)
+        btn = gr.Button("Process")
+        # Outputs in handler return order: log, filtered text, stats, file.
+        out1 = gr.Textbox(lines=15)
+        out2 = gr.Textbox(lines=15)
+        out3 = gr.Textbox()
+        out4 = gr.File()
+        # Input order matches the handler signature (file, noise_labels, selected_model).
+        btn.click(process_document_file, [f, n, m], [out1, out2, out3, out4])
+    # ---------------- PDF ----------------
+    with gr.Tab("PDF (OCR + Text)"):
+        m = gr.Dropdown(list(MODELS.keys()), value=DEFAULT_MODEL)
+        f = gr.File(file_types=[".pdf"])
+        n = gr.CheckboxGroup(LABELS, value=DEFAULT_NOISE)
+        btn = gr.Button("Process PDF")
+        # Outputs in handler return order: log, filtered text, stats, file.
+        out1 = gr.Textbox(lines=15)
+        out2 = gr.Textbox(lines=15)
+        out3 = gr.Textbox()
+        out4 = gr.File()
+        # Input order matches the handler signature (file, noise_labels, selected_model).
+        btn.click(process_pdf_file, [f, n, m], [out1, out2, out3, out4])
+# =========================================================
+# Launch
+# =========================================================
+# Start the app only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    demo.launch()