"""Gradio app that spell-checks German PDFs.

An uploaded PDF is converted to text with Marker, checked with LanguageTool
(``de-DE``), and the findings are rendered as HTML.  A second form posts new
terms to a remote word list so they are no longer reported as misspelled.
"""

# Standard library
import html
import io
import logging
import os
import re
import tempfile
import time

# Third-party (kept in the original relative order, because some of these
# packages have import-time side effects).
import gradio as gr
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    PdfPipelineOptions,
    AcceleratorOptions,
)
import spaces
from docling.datamodel.base_models import InputFormat
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from wordfreq import word_frequency
import requests
import language_tool_python

logger = logging.getLogger(__name__)

WORDLIST_URL = "https://specialist-it.de/data.json"
WORDLIST_POST_URL = "https://specialist-it.de/wortliste.php"
REQUEST_TIMEOUT = 10  # seconds, for every HTTP call made by this module

# LanguageTool rule ids that are never reported.
IGNORE_RULES = {'DE_CASE'}

# A flagged word whose English frequency exceeds this is treated as a valid
# English word and not reported as a German spelling mistake.
ENGLISH_FREQUENCY_THRESHOLD = 1e-6


def _load_meine_orte() -> list:
    """Fetch the custom word list; return ``[]`` if it cannot be loaded.

    The list only extends the spell-checker's dictionary, so a network or
    parsing problem must not prevent the app from starting.
    """
    try:
        response = requests.get(WORDLIST_URL, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        payload = response.json()
    except (requests.exceptions.RequestException, ValueError) as exc:
        logger.warning("Could not load word list from %s: %s", WORDLIST_URL, exc)
        return []
    orte = payload.get("meine_orte", []) if isinstance(payload, dict) else []
    return list(orte) if isinstance(orte, list) else []


meine_orte = _load_meine_orte()

# ASCII-art placeholder inserted wherever Marker emitted an image reference.
dummy = """
___|___|___|___|___|___|___|___|___|___|___|___|__
___|__<(-^,^-)=b_|___|___|___|___|___|___|___|___|
_|___|___|___|___|___|___|___|___|___|___|___|___|
___|___|___|___|___|___|___<'(o.o)'>|___|___|___|_
_|___|___|___|___|___|___|___|___|___|___|___|___|
___|___|_(-_-)zzz__|___|___|___|___|___|___|___|__
_|___|___|___|___|___|___|___|___|___|___|___|___|
___|___|___|___|___|___/ (^_^) /|___|___|___|___|_
_|___|___|___|___|___|___|___|___|___|___|___|___|
_|___|___|___|__d-(^_^)z_|___|___|___|___|___|___|
DUMMYIMAGE
"""

# Substring removed from the final HTML by convert_document().
bad_string = """page"""

css = """
#spinner_md.pending::before {
    content: "";
    display: inline-block;
    width: 20px;
    height: 20px;
    border: 3px solid #eee;
    border-top-color: #2563eb; /* Gradio Blau */
    border-radius: 50%;
    animation: spin 1s linear infinite;
    margin-right: 10px;
    vertical-align: middle;
}
@keyframes spin {
    to { transform: rotate(360deg); }
}
#spinner_md.pending::after {
    content: " Generiere Antwort...";
    font-weight: bold;
    color: #2563eb;
    vertical-align: middle;
}
"""

# check_spelling() reports a friendly error when ``tool`` is None, so a failed
# LanguageTool start-up is logged here instead of aborting the import.
try:
    tool = language_tool_python.LanguageTool('de-DE', new_spellings=meine_orte)
except Exception:  # top-level boundary: log and degrade gracefully
    logger.exception("LanguageTool could not be initialised")
    tool = None

_MARKDOWN_IMAGE_RE = re.compile(r'!\[\]\(_page_\d+_Picture_\d+\.jpeg\)')


def replace_markdown_images(text: str) -> str:
    """Replace Marker's ``![](_page_N_Picture_M.jpeg)`` references with ``dummy``."""
    # A callable replacement inserts ``dummy`` literally; passing the string
    # itself would make ``re`` interpret any backslash in it as an escape.
    return _MARKDOWN_IMAGE_RE.sub(lambda _match: dummy, text)


def remove_hashes(text: str) -> str:
    """Return ``text`` with every ``#`` character removed."""
    return text.replace('#', '')


# Docling
accelerator_options = AcceleratorOptions(
    num_threads=8, device=AcceleratorDevice.CPU
)

pipeline_options = PdfPipelineOptions()
pipeline_options.accelerator_options = accelerator_options
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True

docling_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
        )
    }
)

# Marker
marker_converter = PdfConverter(
    artifact_dict=create_model_dict(),
)


def _escape_segment(segment: str) -> str:
    """Strip ``#`` from raw text and HTML-escape it.

    Hashes are removed *before* escaping: removing them afterwards would also
    destroy the ``#`` inside entities such as ``&#x27;`` that ``html.escape``
    produces for apostrophes.
    """
    return html.escape(remove_hashes(segment))


def check_spelling(text_input: str) -> str:
    """Check ``text_input`` with LanguageTool and return the result as HTML.

    Args:
        text_input: Plain text to check.  ``None`` is treated as empty.

    Returns:
        A user-facing message if LanguageTool is unavailable, the text is
        empty, or the text is itself an error message (starts with "❌" or
        "Fehler"); otherwise an HTML fragment with every finding highlighted
        and up to three suggestions shown as tooltip.
    """
    if tool is None:
        return "❌ **Fehler:** LanguageTool konnte nicht geladen werden."

    text = text_input or ""
    if not text.strip():
        return "*Bitte lade eine Datei hoch oder füge Text ein...*"
    if text.startswith(("❌", "Fehler")):
        return text

    # Drop ignored rules, then findings that are just valid English words.
    matches = [m for m in tool.check(text) if m.rule_id not in IGNORE_RULES]
    matches = _filter_english_words(text, matches)

    # NOTE(review): the HTML tags of the original literals were not
    # recoverable from the source; the markup below is a minimal
    # reconstruction (paragraphs, <mark> with a title tooltip, <br>).
    if not matches:
        return "<p>✅ Keine Fehler gefunden!</p>"

    # Walk the matches back to front so that the slice boundaries always refer
    # to offsets in the untouched original text.
    parts = []
    last_idx = len(text)
    for match in reversed(matches):
        start = match.offset
        end = start + match.error_length
        if end > last_idx:
            # Overlaps a finding that was already rendered; emitting it too
            # would duplicate part of the text.
            continue
        parts.append(_escape_segment(text[end:last_idx]))
        if match.replacements:
            suggestions = ", ".join(match.replacements[:3])
        else:
            suggestions = "keine Vorschläge"
        parts.append(
            f'<mark title="{html.escape(suggestions)}">'
            f'{_escape_segment(text[start:end])}</mark>'
        )
        last_idx = start
    parts.append(_escape_segment(text[:last_idx]))

    html_text = "".join(reversed(parts)).replace("\n", "<br>")
    html_result = (
        f"<p><strong>⚠️ {len(matches)} Fehler gefunden</strong></p>\n"
        f"<p>{html_text}</p>"
    )
    return replace_markdown_images(html_result)


def _filter_english_words(text: str, matches: list) -> list:
    """Drop spelling findings whose flagged word is a common English word.

    Only matches of rule ``GERMAN_SPELLER_RULE`` are candidates for removal;
    all other matches are kept unchanged and in order.
    """
    clean_matches = []
    for m in matches:
        if m.rule_id == 'GERMAN_SPELLER_RULE':
            word = text[m.offset:m.offset + m.error_length].lower()
            if word_frequency(word, 'en') > ENGLISH_FREQUENCY_THRESHOLD:
                continue
        clean_matches.append(m)
    return clean_matches


def convert_document(file):
    """Convert the uploaded PDF with Marker and return the spell-check HTML.

    Args:
        file: Value delivered by the ``gr.File`` component: an object with a
            ``name`` attribute holding the path, a plain path string, or
            ``None`` when nothing was uploaded.

    Returns:
        The output of ``check_spelling`` with ``bad_string`` removed.
    """
    if file is None:
        return "*Bitte lade eine Datei hoch oder füge Text ein...*"

    # The word list used to be re-downloaded here into local variables that
    # were never used (``tool`` is built once at import time), so that request
    # only added latency and a failure mode.
    # NOTE(review): terms added via add_ort() therefore take effect only after
    # a restart — confirm whether a live refresh of the spellings is wanted.
    path = getattr(file, "name", file)
    rendered = marker_converter(path)
    text, _, _images = text_from_rendered(rendered)
    return check_spelling(text).replace(bad_string, "")


def add_ort(neuer_ort: str) -> str:
    """Send a new term to the PHP backend via POST and report the outcome.

    Args:
        neuer_ort: Term to add; surrounding whitespace is ignored.

    Returns:
        A user-facing status message (success, duplicate, or error).
    """
    neuer_ort = (neuer_ort or "").strip()
    if not neuer_ort:
        return "⚠️ Bitte einen Ort eingeben."

    try:
        response = requests.post(
            WORDLIST_POST_URL,
            json={"ort": neuer_ort},
            timeout=REQUEST_TIMEOUT,
        )
    except requests.exceptions.RequestException as e:
        return f"❌ Verbindungsfehler: {str(e)}"

    # A non-JSON or non-object body is not a connection problem; fall through
    # to the status handling with an empty payload instead.
    try:
        data = response.json()
    except ValueError:
        data = {}
    if not isinstance(data, dict):
        data = {}

    if response.status_code == 200 and data.get("success"):
        return (
            f"✅ '{neuer_ort}' erfolgreich hinzugefügt. "
            f"Gesamt: {data.get('anzahl_orte')} Orte."
        )
    if response.status_code == 409:
        return f"ℹ️ '{neuer_ort}' existiert bereits in der Liste."
    return f"❌ Fehler: {data.get('error', 'Unbekannter Fehler')}"


with gr.Blocks() as app:
    gr.Markdown("# Language Tool \n")

    output_text = gr.HTML(label="Ergebnis", elem_id="spinner_md")
    file_input = gr.File(label="PDF hochladen", file_types=[".pdf"])
    convert_button = gr.Button("Convert")

    convert_button.click(
        fn=convert_document,
        inputs=[file_input],
        outputs=[output_text],
        show_progress="full"
    )

    gr.Markdown("---")
    gr.Markdown("### Begriff zur Wortliste hinzufügen (Begriff wird nicht korrigiert)")

    with gr.Row():
        ort_input = gr.Textbox(
            label="Ort",
            placeholder="z. B. Würzburg",
            scale=4
        )
        add_button = gr.Button("Hinzufügen", scale=1)

    add_status = gr.Markdown()

    add_button.click(
        fn=add_ort,
        inputs=[ort_input],
        outputs=[add_status]
    )

app.launch(debug=True, show_error=True, css=css)