| import gradio as gr |
| from docling.document_converter import DocumentConverter, PdfFormatOption |
| from docling.datamodel.pipeline_options import ( |
| AcceleratorDevice, |
| PdfPipelineOptions, |
| AcceleratorOptions |
| ) |
| import spaces |
| from docling.datamodel.base_models import InputFormat |
| from marker.converters.pdf import PdfConverter |
| from marker.models import create_model_dict |
| from marker.output import text_from_rendered |
| from wordfreq import word_frequency |
| import tempfile |
| import io |
| import re |
| import os |
| import requests |
| import language_tool_python |
| import html |
| import time |
|
|
def _load_word_list_payload(
    url: str = "https://specialist-it.de/data.json",
    timeout: float = 10,
) -> dict:
    """Download the JSON payload that carries the custom word list.

    Args:
        url: Address of the JSON document.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would block the
            import of this module.

    Returns:
        The decoded JSON object, or an empty dict when the request fails,
        the server answers with an HTTP error, or the body is not a JSON
        object.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        payload = response.json()
    except (requests.exceptions.RequestException, ValueError) as exc:
        # Degrade to "no custom words" rather than aborting start-up.
        print(f"Warning: could not load word list from {url}: {exc}")
        return {}
    return payload if isinstance(payload, dict) else {}


# Custom words fetched once at import time; the list stored under the
# "meine_orte" key is handed to LanguageTool as additional spellings.
_data = _load_word_list_payload()
meine_orte = _data.get("meine_orte", [])
|
|
|
|
# ASCII-art placeholder text; replace_markdown_images() inserts it in place
# of every matched Markdown image reference.
dummy = """
___|___|___|___|___|___|___|___|___|___|___|___|__
___|__<(-^,^-)=b_|___|___|___|___|___|___|___|___|
_|___|___|___|___|___|___|___|___|___|___|___|___|
___|___|___|___|___|___|___<'(o.o)'>|___|___|___|_
_|___|___|___|___|___|___|___|___|___|___|___|___|
___|___|_(-_-)zzz__|___|___|___|___|___|___|___|__
_|___|___|___|___|___|___|___|___|___|___|___|___|
___|___|___|___|___|___/ (^_^) /|___|___|___|___|_
_|___|___|___|___|___|___|___|___|___|___|___|___|
_|___|___|___|__d-(^_^)z_|___|___|___|___|___|___|
DUMMYIMAGE
"""
|
|
# Error markup for the word "page", written in the same span format that
# check_spelling() emits for a flagged word. convert_document() removes every
# occurrence of this exact string from the final HTML, so it must stay
# byte-identical to what check_spelling() produces.
bad_string ="""<span style="color: red; text-decoration: underline;" title="Möglicher Tippfehler gefunden. (Vorschläge: Page, Lage, Tage)">page</span>"""
|
|
# Custom stylesheet passed to app.launch(). All rules target the element with
# id "spinner_md" (the result panel) while it carries the class "pending":
# a rotating ring is drawn before its content and a status text after it.
# NOTE(review): the "pending" class is presumably set by Gradio while an
# event is running — confirm against the Gradio version in use.
css = """
#spinner_md.pending::before {
    content: "";
    display: inline-block;
    width: 20px;
    height: 20px;
    border: 3px solid #eee;
    border-top-color: #2563eb; /* Gradio Blau */
    border-radius: 50%;
    animation: spin 1s linear infinite;
    margin-right: 10px;
    vertical-align: middle;
}

@keyframes spin {
    to { transform: rotate(360deg); }
}

#spinner_md.pending::after {
    content: " Generiere Antwort...";
    font-weight: bold;
    color: #2563eb;
    vertical-align: middle;
}
"""
|
|
# Shared LanguageTool checker for German (de-DE); the words in `meine_orte`
# are passed as additional spellings.
# check_spelling() already reports an error to the user when `tool` is None,
# so a failure while constructing the checker is turned into that state
# instead of aborting the import (which would take the whole UI down).
try:
    tool = language_tool_python.LanguageTool('de-DE', new_spellings=meine_orte)
except Exception as exc:  # top-level boundary: report and keep the app alive
    print(f"Warning: LanguageTool could not be started: {exc}")
    tool = None
|
|
def replace_markdown_images(text, replacement=None):
    """Replace Markdown image references with placeholder text.

    Only references with empty alt text and a target of the form
    ``_page_<digits>_Picture_<digits>.jpeg`` are matched, i.e.
    ``![](_page_3_Picture_1.jpeg)``.

    Args:
        text: The text to process.
        replacement: Text inserted for each matched reference. When omitted,
            the module-level ``dummy`` ASCII art is used.

    Returns:
        ``text`` with every matched image reference replaced.
    """
    if replacement is None:
        replacement = dummy
    pattern = r'!\[\]\(_page_\d+_Picture_\d+\.jpeg\)'
    # A callable replacement is inserted verbatim. A plain string would be
    # treated as a template, so a backslash or group reference such as "\1"
    # in the placeholder would be misinterpreted or raise an error.
    return re.sub(pattern, lambda _match: replacement, text)
|
|
def remove_hashes(text):
    """Return *text* with every ``#`` character deleted."""
    return text.replace("#", "")
|
|
| |
# Docling accelerator settings: CPU device with 8 threads.
accelerator_options = AcceleratorOptions(
    num_threads=8, device=AcceleratorDevice.CPU
)


# Docling PDF pipeline options: OCR and table-structure recognition are
# switched on, and cell matching is enabled for the table-structure step.
pipeline_options = PdfPipelineOptions()
pipeline_options.accelerator_options = accelerator_options
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True


# Docling converter that applies the pipeline options above to PDF input.
# NOTE(review): nothing in the visible part of this file references
# `docling_converter` (convert_document() goes through `marker_converter`) —
# confirm whether it is still needed.
docling_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
        )
    }
)
|
|
| |
# Marker PDF converter used by convert_document(). It is built once at import
# time and receives the dictionary returned by create_model_dict().
marker_converter = PdfConverter(
    artifact_dict=create_model_dict(),
)
|
|
|
|
def check_spelling(text_input: str) -> str:
    """Check text with LanguageTool and return it as HTML with errors marked.

    Matches of the rules listed in ``IGNORE_RULES`` are dropped, and spelling
    matches on English words are removed via ``_filter_english_words``. Each
    remaining match is wrapped in a red, underlined ``<span>`` whose ``title``
    holds the LanguageTool message and up to three suggestions.

    Args:
        text_input: The text to check; ``None`` is treated as empty.

    Returns:
        - An error message when the module-level ``tool`` is ``None``.
        - A hint when the text is empty or whitespace only.
        - The input unchanged when it starts with "❌" or "Fehler" (it is
          already an error message).
        - A green success paragraph when no matches remain.
        - Otherwise an HTML ``<div>`` with the number of matches and the
          text with all matches highlighted. ``#`` characters are removed
          from the text, line breaks become ``<br>``, and image references
          are replaced via ``replace_markdown_images``.
    """
    IGNORE_RULES = {'DE_CASE'}

    if tool is None:
        return "❌ **Fehler:** LanguageTool konnte nicht geladen werden."
    text = text_input or ""
    if not text.strip():
        return "*Bitte lade eine Datei hoch oder füge Text ein...*"
    if text.startswith(("❌", "Fehler")):
        return text

    matches = [m for m in tool.check(text) if m.rule_id not in IGNORE_RULES]
    matches = _filter_english_words(text, matches)

    if not matches:
        return "<p style='color: green;'>✅ Keine Fehler gefunden!</p>"

    def render_plain(segment: str) -> str:
        # Strip '#' BEFORE escaping: html.escape() turns an apostrophe into
        # the entity "&#x27;", and removing '#' from the finished HTML would
        # corrupt that entity into the literal text "&x27;".
        return html.escape(remove_hashes(segment))

    parts = []
    cursor = 0  # index of the first character not yet emitted
    for match in sorted(matches, key=lambda m: m.offset):
        start = match.offset
        end = start + match.error_length
        if start < cursor:
            # Overlaps a match that is already highlighted; emitting it as
            # well would duplicate part of the text.
            continue
        parts.append(render_plain(text[cursor:start]))
        suggestions = ", ".join(match.replacements[:3]) if match.replacements else "keine Vorschläge"
        parts.append(
            f'<span style="color: red; text-decoration: underline;" '
            f'title="{html.escape(match.message)} (Vorschläge: {html.escape(suggestions)})">'
            f'{render_plain(text[start:end])}</span>'
        )
        cursor = end
    parts.append(render_plain(text[cursor:]))

    html_text = "".join(parts).replace("\n", "<br>")
    html_result = f"<div><strong>⚠️ {len(matches)} Fehler gefunden</strong><br><br>{html_text}</div>"
    return replace_markdown_images(html_result)
|
|
|
|
def _filter_english_words(text: str, matches: list) -> list:
    """Drop spelling matches whose flagged word is a known English word.

    Only matches of the rule ``GERMAN_SPELLER_RULE`` can be dropped: the
    flagged slice of ``text`` is lower-cased and the match is discarded when
    the English ``word_frequency`` of that word exceeds ``1e-6``. All other
    matches are kept, in their original order.
    """
    def is_english_word(match) -> bool:
        if match.rule_id != 'GERMAN_SPELLER_RULE':
            return False
        flagged = text[match.offset:match.offset + match.error_length].lower()
        return word_frequency(flagged, 'en') > 1e-6

    return [match for match in matches if not is_english_word(match)]
|
|
def convert_document(file):
    """Convert an uploaded PDF to text and return the spell-check result.

    The PDF is converted with ``marker_converter``, the extracted text is
    passed to ``check_spelling``, and every occurrence of ``bad_string`` is
    removed from the resulting HTML.

    Args:
        file: The upload from the file input: an object whose ``name``
            attribute holds the path, or a plain path string. ``None`` when
            nothing has been uploaded.

    Returns:
        The HTML produced by ``check_spelling``, or a hint asking for a file
        when ``file`` is ``None``.
    """
    if file is None:
        # "Convert" was clicked without an upload; file.name would raise.
        return "*Bitte lade eine Datei hoch oder füge Text ein...*"

    # NOTE: the word list is not downloaded again here. The checker is built
    # once at import time, so a per-call download had no effect on the result
    # and only added a network request without a timeout.
    path = getattr(file, "name", file)
    rendered = marker_converter(path)
    text, _, _ = text_from_rendered(rendered)
    return check_spelling(text).replace(bad_string, "")
|
|
|
|
def add_ort(neuer_ort: str) -> str:
    """Send a new place name to the PHP backend via POST.

    Args:
        neuer_ort: The term to add; surrounding whitespace is stripped and
            ``None`` is treated as empty.

    Returns:
        A status message for the UI: a warning for empty input, a success
        message for HTTP 200 with a truthy ``success`` field, a notice for
        HTTP 409 (term already present), the backend's ``error`` text (or a
        generic message) for any other response, or a connection error
        message when the request itself fails.
    """
    neuer_ort = (neuer_ort or "").strip()
    if not neuer_ort:
        return "⚠️ Bitte einen Ort eingeben."

    try:
        response = requests.post(
            "https://specialist-it.de/wortliste.php",
            json={"ort": neuer_ort},
            timeout=10,
        )
    except requests.exceptions.RequestException as e:
        return f"❌ Verbindungsfehler: {str(e)}"

    # Decide on the status code before touching the body: a 409 answer does
    # not need to carry parseable JSON.
    if response.status_code == 409:
        return f"ℹ️ '{neuer_ort}' existiert bereits in der Liste."

    try:
        data = response.json()
    except ValueError:
        # Non-JSON body (e.g. an HTML error page from the web server).
        data = None
    if not isinstance(data, dict):
        data = {}

    if response.status_code == 200 and data.get("success"):
        return f"✅ '{neuer_ort}' erfolgreich hinzugefügt. Gesamt: {data.get('anzahl_orte')} Orte."
    return f"❌ Fehler: {data.get('error', 'Unbekannter Fehler')}"
|
|
|
|
# Gradio UI: a PDF upload whose spell-check result is rendered as HTML, plus
# a small form for adding a term to the remote word list.
with gr.Blocks() as app:
    gr.Markdown("# Language Tool \n")
    # Result panel; its element id is the one targeted by the rules in `css`.
    output_text = gr.HTML(label="Ergebnis", elem_id="spinner_md")
    file_input = gr.File(label="PDF hochladen", file_types=[".pdf"])


    # Run convert_document() on the uploaded file and show its HTML result.
    convert_button = gr.Button("Convert")
    convert_button.click(
        fn=convert_document,
        inputs=[file_input],
        outputs=[output_text],
        show_progress="full"
    )


    gr.Markdown("---")
    gr.Markdown("### Begriff zur Wortliste hinzufügen (Begriff wird nicht korrigiert)")
    with gr.Row():
        ort_input = gr.Textbox(
            label="Ort",
            placeholder="z. B. Würzburg",
            scale=4
        )
        add_button = gr.Button("Hinzufügen", scale=1)
    # NOTE(review): the status line is placed below the row here; the
    # original nesting could not be recovered — confirm the intended layout.
    add_status = gr.Markdown()


    # Send the entered term to the backend and display the returned status.
    add_button.click(
        fn=add_ort,
        inputs=[ort_input],
        outputs=[add_status]
    )


# Start the server with debug output and error display enabled, applying the
# custom stylesheet defined in `css`.
app.launch(debug=True, show_error=True, css=css)
|
|