docling / app.py
mgokg's picture
Update app.py
e6f565f verified
import gradio as gr
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
PdfPipelineOptions,
AcceleratorOptions
)
import spaces
from docling.datamodel.base_models import InputFormat
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from wordfreq import word_frequency
import tempfile
import io
import re
import os
import requests
import language_tool_python
import html
import time
# Load the custom word list (entries LanguageTool should accept as correctly
# spelled) from the remote JSON document. The request is bounded by a timeout
# and any network/HTTP/JSON failure falls back to an empty list, so an
# unreachable word-list server cannot hang or crash application start-up.
try:
    _response = requests.get("https://specialist-it.de/data.json", timeout=10)
    _response.raise_for_status()
    _data = _response.json()
except (requests.exceptions.RequestException, ValueError) as _exc:
    print(f"Wortliste konnte nicht geladen werden: {_exc}")
    _data = {}
# Guard against a JSON payload that is not an object (e.g. a bare list).
meine_orte = _data.get("meine_orte", []) if isinstance(_data, dict) else []
# ASCII-art placeholder text. replace_markdown_images() substitutes it for
# every markdown image reference of the form ![](_page_N_Picture_M.jpeg).
dummy = """
___|___|___|___|___|___|___|___|___|___|___|___|__
___|__<(-^,^-)=b_|___|___|___|___|___|___|___|___|
_|___|___|___|___|___|___|___|___|___|___|___|___|
___|___|___|___|___|___|___<'(o.o)'>|___|___|___|_
_|___|___|___|___|___|___|___|___|___|___|___|___|
___|___|_(-_-)zzz__|___|___|___|___|___|___|___|__
_|___|___|___|___|___|___|___|___|___|___|___|___|
___|___|___|___|___|___/ (^_^) /|___|___|___|___|_
_|___|___|___|___|___|___|___|___|___|___|___|___|
_|___|___|___|__d-(^_^)z_|___|___|___|___|___|___|
DUMMYIMAGE
"""
# Exact HTML span that check_spelling() emits when the word "page" is flagged
# with this message and these suggestions; convert_document() strips it from
# the final output.
# NOTE(review): presumably this suppresses findings on the "page" part of
# image file names such as _page_0_Picture_1.jpeg - confirm.
bad_string ="""<span style="color: red; text-decoration: underline;" title="Möglicher Tippfehler gefunden. (Vorschläge: Page, Lage, Tage)">page</span>"""
# Stylesheet for the element with id "spinner_md": while it carries the
# "pending" class, a rotating ring is drawn before it and a status label
# after it. The final rule was missing its closing brace, leaving the
# stylesheet unbalanced; it is closed here so nothing that follows this CSS
# can be swallowed by the open block.
css = """
#spinner_md.pending::before {
content: "";
display: inline-block;
width: 20px;
height: 20px;
border: 3px solid #eee;
border-top-color: #2563eb; /* Gradio Blau */
border-radius: 50%;
animation: spin 1s linear infinite;
margin-right: 10px;
vertical-align: middle;
}
@keyframes spin {
to { transform: rotate(360deg); }
}
#spinner_md.pending::after {
content: " Generiere Antwort...";
font-weight: bold;
color: #2563eb;
vertical-align: middle;
}
"""
# Shared LanguageTool instance for German, extended with the custom word list.
# check_spelling() already reports a friendly error when `tool` is None, so a
# start-up failure is recorded here instead of aborting the whole import;
# this is what makes that guard reachable.
try:
    tool = language_tool_python.LanguageTool('de-DE', new_spellings=meine_orte)
except Exception as _exc:  # top-level boundary: log and degrade gracefully
    print(f"LanguageTool konnte nicht geladen werden: {_exc}")
    tool = None
def replace_markdown_images(text):
    """Replace markdown image references with the ``dummy`` placeholder.

    Every occurrence of ``![](_page_<n>_Picture_<m>.jpeg)`` in *text* is
    swapped for the module-level ``dummy`` string.

    Args:
        text: The text (plain or HTML) to process.

    Returns:
        *text* with all matching image references replaced.
    """
    pattern = r'!\[\]\(_page_\d+_Picture_\d+\.jpeg\)'
    # A callable replacement inserts `dummy` verbatim; passing the string
    # itself would make re.sub interpret any backslash in it as an escape or
    # group reference.
    return re.sub(pattern, lambda _match: dummy, text)
def remove_hashes(text):
    """Return *text* with every ``#`` character removed."""
    return text.replace('#', '')
# Docling
# Build a Docling DocumentConverter for PDF input: 8 threads on the CPU
# device, with OCR, table-structure recognition and cell matching enabled.
# NOTE(review): docling_converter is not referenced anywhere else in the
# visible code (convert_document uses the Marker converter) - confirm whether
# it is still needed.
accelerator_options = AcceleratorOptions(
    num_threads=8, device=AcceleratorDevice.CPU
)
pipeline_options = PdfPipelineOptions()
pipeline_options.accelerator_options = accelerator_options
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
docling_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
        )
    }
)
# Marker
# Module-level Marker PDF converter; convert_document() calls it with the
# path of the uploaded PDF.
# NOTE(review): create_model_dict() presumably loads the Marker models at
# import time - confirm the start-up cost is acceptable.
marker_converter = PdfConverter(
    artifact_dict=create_model_dict(),
)
def check_spelling(text_input: str) -> str:
    """Check text with LanguageTool and return the findings as HTML.

    Findings of the rules in ``IGNORE_RULES`` and spelling findings on common
    English words are dropped. Each remaining finding is wrapped in a red,
    underlined ``<span>`` whose tooltip holds the message and up to three
    suggestions. ``#`` characters are removed from the text and markdown
    image references are replaced by the placeholder.

    Args:
        text_input: The text to check; ``None`` is treated as empty.

    Returns:
        An HTML report, or a short status message when LanguageTool is
        unavailable, the input is empty, the input is itself an error
        message (returned unchanged), or nothing was found.
    """
    IGNORE_RULES = {'DE_CASE'}
    if tool is None:
        return "❌ **Fehler:** LanguageTool konnte nicht geladen werden."
    text = text_input or ""
    if not text.strip():
        return "*Bitte lade eine Datei hoch oder füge Text ein...*"
    # Pass upstream error messages through untouched.
    if text.startswith(("❌", "Fehler")):
        return text

    # Rule filter, then drop findings on valid English words.
    matches = [m for m in tool.check(text) if m.rule_id not in IGNORE_RULES]
    matches = _filter_english_words(text, matches)
    if not matches:
        return "<p style='color: green;'>✅ Keine Fehler gefunden!</p>"

    def _clean(fragment: str) -> str:
        # Strip '#' BEFORE escaping: doing it afterwards also removes the '#'
        # of numeric entities such as "&#x27;" (an escaped apostrophe) and
        # leaves a literal "&x27;" in the rendered page.
        return html.escape(remove_hashes(fragment))

    parts = []
    cursor = 0  # index of the first character not yet emitted
    for match in sorted(matches, key=lambda m: m.offset):
        start = match.offset
        end = start + match.error_length
        if start < cursor:
            # Overlaps a finding that was already rendered; emitting it too
            # would duplicate part of the text.
            continue
        parts.append(_clean(text[cursor:start]))
        suggestions = ", ".join(match.replacements[:3]) if match.replacements else "keine Vorschläge"
        parts.append(
            f'<span style="color: red; text-decoration: underline;" '
            f'title="{_clean(match.message)} (Vorschläge: {_clean(suggestions)})">'
            f'{_clean(text[start:end])}</span>'
        )
        cursor = end
    parts.append(_clean(text[cursor:]))

    # One pass is enough: it also turns "\n\n" into "<br><br>".
    html_text = "".join(parts).replace("\n", "<br>")
    html_result = f"<div><strong>⚠️ {len(matches)} Fehler gefunden</strong><br><br>{html_text}</div>"
    return replace_markdown_images(html_result)
def _filter_english_words(text: str, matches: list) -> list:
    """Return *matches* without spelling findings on valid English words.

    A finding is dropped only when it comes from ``GERMAN_SPELLER_RULE`` and
    the lower-cased flagged word has an English word frequency above 1e-6.
    All other findings are kept in their original order.
    """
    def _is_english_word(hit) -> bool:
        if hit.rule_id != 'GERMAN_SPELLER_RULE':
            return False
        flagged = text[hit.offset:hit.offset + hit.error_length].lower()
        return word_frequency(flagged, 'en') > 1e-6

    return [hit for hit in matches if not _is_english_word(hit)]
def convert_document(file):
    """Convert an uploaded PDF to text with Marker and spell-check it.

    Args:
        file: The uploaded file from the Gradio ``File`` component: an object
            exposing the path as ``.name``, a plain path string, or ``None``
            when nothing was uploaded.

    Returns:
        The HTML produced by ``check_spelling`` with every occurrence of
        ``bad_string`` removed, or the "please upload" hint when *file* is
        ``None``.
    """
    # The previous per-call download of data.json was removed: its result was
    # only bound to unused locals (the LanguageTool instance is built once at
    # import time), yet without a timeout it could hang or fail a conversion.
    if file is None:
        # Reuse the empty-input message instead of raising AttributeError.
        return check_spelling("")
    pdf_path = getattr(file, "name", file)
    rendered = marker_converter(pdf_path)
    text, _, _images = text_from_rendered(rendered)
    return check_spelling(text).replace(bad_string, "")
def add_ort(neuer_ort: str) -> str:
    """Send a new word-list entry to the PHP backend via POST.

    Args:
        neuer_ort: The entry to add; surrounding whitespace is ignored and
            ``None`` is treated as empty.

    Returns:
        A status message for the UI: a prompt when the input is empty,
        success (HTTP 200 with a truthy ``success`` field), "already exists"
        (HTTP 409), a backend error, or a connection error.
    """
    neuer_ort = (neuer_ort or "").strip()
    if not neuer_ort:
        return "⚠️ Bitte einen Ort eingeben."
    # Only the request itself sits in the try block, so that just transport
    # problems are reported as connection errors.
    try:
        response = requests.post(
            "https://specialist-it.de/wortliste.php",
            json={"ort": neuer_ort},
            timeout=10,
        )
    except requests.exceptions.RequestException as e:
        return f"❌ Verbindungsfehler: {str(e)}"

    # The status code alone identifies a duplicate; no body is needed.
    if response.status_code == 409:
        return f"ℹ️ '{neuer_ort}' existiert bereits in der Liste."

    # A non-JSON or non-object body (e.g. an HTML error page) must neither
    # raise nor be misreported as a connection problem.
    try:
        data = response.json()
    except ValueError:
        data = None
    if not isinstance(data, dict):
        data = {}

    if response.status_code == 200 and data.get("success"):
        return f"✅ '{neuer_ort}' erfolgreich hinzugefügt. Gesamt: {data.get('anzahl_orte')} Orte."
    return f"❌ Fehler: {data.get('error', 'Unbekannter Fehler')}"
# Gradio UI: a result pane, a PDF upload with a convert button, and a small
# form for adding an entry to the remote word list.
# NOTE(review): the original indentation was lost; the nesting below (in
# particular add_status placed outside the Row) is inferred - confirm.
with gr.Blocks() as app:
    gr.Markdown("# Language Tool \n")
    # elem_id matches the "#spinner_md" selectors in `css`.
    output_text = gr.HTML(label="Ergebnis", elem_id="spinner_md")
    file_input = gr.File(label="PDF hochladen", file_types=[".pdf"])
    convert_button = gr.Button("Convert")
    # Run the PDF conversion and spell check; the HTML result is shown in
    # output_text.
    convert_button.click(
        fn=convert_document,
        inputs=[file_input],
        outputs=[output_text],
        show_progress="full"
    )
    gr.Markdown("---")
    gr.Markdown("### Begriff zur Wortliste hinzufügen (Begriff wird nicht korrigiert)")
    with gr.Row():
        ort_input = gr.Textbox(
            label="Ort",
            placeholder="z. B. Würzburg",
            scale=4
        )
        add_button = gr.Button("Hinzufügen", scale=1)
    # Status line for the result message returned by add_ort().
    add_status = gr.Markdown()
    add_button.click(
        fn=add_ort,
        inputs=[ort_input],
        outputs=[add_status]
    )
# NOTE(review): css is passed to launch() here; confirm the installed Gradio
# version accepts it there (older releases take css on gr.Blocks instead).
app.launch(debug=True, show_error=True, css=css)