import gradio as gr
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
PdfPipelineOptions,
AcceleratorOptions
)
import spaces
from docling.datamodel.base_models import InputFormat
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from wordfreq import word_frequency
import tempfile
import io
import re
import os
import requests
import language_tool_python
import html
import time
# Load the custom word list ("meine_orte") that is later handed to LanguageTool
# as additional accepted spellings.  This runs at import time, so the request
# gets a timeout and any network/HTTP/JSON failure falls back to an empty list
# instead of hanging or aborting the whole application start-up.
try:
    _response = requests.get("https://specialist-it.de/data.json", timeout=10)
    _response.raise_for_status()
    _data = _response.json()
except (requests.exceptions.RequestException, ValueError) as exc:
    print(f"Could not load custom word list: {exc}")
    _data = {}
# Guard against a JSON payload that is not an object (e.g. a bare list).
meine_orte = _data.get("meine_orte", []) if isinstance(_data, dict) else []
# ASCII-art placeholder text.  replace_markdown_images() substitutes it for
# every Markdown image reference of the form "![](_page_N_Picture_M.jpeg)".
dummy = """
___|___|___|___|___|___|___|___|___|___|___|___|__
___|__<(-^,^-)=b_|___|___|___|___|___|___|___|___|
_|___|___|___|___|___|___|___|___|___|___|___|___|
___|___|___|___|___|___|___<'(o.o)'>|___|___|___|_
_|___|___|___|___|___|___|___|___|___|___|___|___|
___|___|_(-_-)zzz__|___|___|___|___|___|___|___|__
_|___|___|___|___|___|___|___|___|___|___|___|___|
___|___|___|___|___|___/ (^_^) /|___|___|___|___|_
_|___|___|___|___|___|___|___|___|___|___|___|___|
_|___|___|___|__d-(^_^)z_|___|___|___|___|___|___|
DUMMYIMAGE
"""
# Exact highlight markup, in the format check_spelling() emits, for the word
# "page" with this specific message and suggestion list.  convert_document()
# deletes every occurrence of it from the final HTML.
# NOTE(review): presumably this targets the "page" fragment of the image
# file names in the converted text -- confirm it is still needed.
bad_string = """<span style="color: red; text-decoration: underline;" title="Möglicher Tippfehler gefunden. (Vorschläge: Page, Lage, Tage)">page</span>"""
# Stylesheet passed to app.launch().  It targets the element with id
# "spinner_md" (the elem_id of the result HTML component): while that element
# carries the "pending" class, a rotating ring is drawn before it and the
# label " Generiere Antwort..." after it.
# NOTE(review): assumes the UI framework sets the "pending" class while the
# handler is running -- confirm against the Gradio version in use.
css = """
#spinner_md.pending::before {
content: "";
display: inline-block;
width: 20px;
height: 20px;
border: 3px solid #eee;
border-top-color: #2563eb;
border-radius: 50%;
animation: spin 1s linear infinite;
margin-right: 10px;
vertical-align: middle;
}
@keyframes spin {
to { transform: rotate(360deg); }
}
#spinner_md.pending::after {
content: " Generiere Antwort...";
font-weight: bold;
color: #2563eb;
vertical-align: middle;
}
"""
# Shared German LanguageTool instance; the custom word list is registered as
# additional accepted spellings.  check_spelling() already reports a friendly
# error when `tool` is None, so a failed start-up must leave `tool` as None
# rather than abort the module import.
try:
    tool = language_tool_python.LanguageTool('de-DE', new_spellings=meine_orte)
except Exception as exc:  # top-level boundary: report and degrade gracefully
    print(f"LanguageTool could not be initialised: {exc}")
    tool = None
# Markdown image references with an empty alt text whose target looks like
# "_page_<digits>_Picture_<digits>.jpeg".
_MARKDOWN_IMAGE_RE = re.compile(r'!\[\]\(_page_\d+_Picture_\d+\.jpeg\)')


def replace_markdown_images(text):
    """Return *text* with every matching image reference replaced by ``dummy``.

    Only references of the exact shape ``![](_page_N_Picture_M.jpeg)`` are
    substituted; everything else is returned unchanged.
    """
    return _MARKDOWN_IMAGE_RE.sub(dummy, text)
def remove_hashes(text):
    """Return *text* with every ``#`` character deleted."""
    return text.replace('#', '')
# Docling PDF pipeline configuration.
# NOTE(review): `docling_converter` is built here but not referenced by any
# other code visible in this file -- confirm whether it is still required.

# Run the pipeline on the CPU with 8 threads.
accelerator_options = AcceleratorOptions(
    num_threads=8, device=AcceleratorDevice.CPU
)
pipeline_options = PdfPipelineOptions()
pipeline_options.accelerator_options = accelerator_options
# Enable OCR, table-structure recognition and cell matching.
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
# Converter that applies the options above to PDF input.
docling_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
        )
    }
)
# Marker PDF converter used by convert_document().  It is created once at
# import time with the model dictionary returned by create_model_dict() and
# called with a file path for each uploaded document.
marker_converter = PdfConverter(artifact_dict=create_model_dict())
def check_spelling(text_input: str) -> str:
    """Spell-check German text and return it as HTML with errors highlighted.

    The text is checked with the module-level LanguageTool instance.  Matches
    of the rules in ``IGNORE_RULES`` are dropped, as are speller matches that
    ``_filter_english_words`` recognises as common English words.  Each
    remaining match is wrapped in a red, underlined ``<span>`` whose tooltip
    shows the LanguageTool message and up to three suggestions.

    Args:
        text_input: Plain text to check; ``None`` is treated as empty.

    Returns:
        One of:
        * an error notice if LanguageTool is unavailable,
        * a hint if the text is empty or whitespace only,
        * ``text_input`` itself if it already is an error message
          (starts with "❌" or "Fehler"),
        * a green success paragraph if nothing was flagged,
        * otherwise a ``<div>`` with the error count and the highlighted text,
          with ``#`` characters removed from the document text, newlines
          turned into ``<br>`` and image references replaced by the
          placeholder art.
    """
    IGNORE_RULES = {'DE_CASE'}

    if tool is None:
        return "❌ **Fehler:** LanguageTool konnte nicht geladen werden."

    text = text_input or ""
    if not text.strip():
        return "*Bitte lade eine Datei hoch oder füge Text ein...*"
    # Error messages produced upstream are passed through untouched.
    if text.startswith(("❌", "Fehler")):
        return text

    matches = [m for m in tool.check(text) if m.rule_id not in IGNORE_RULES]
    matches = _filter_english_words(text, matches)
    if not matches:
        return "<p style='color: green;'>✅ Keine Fehler gefunden!</p>"

    def _escape(fragment: str) -> str:
        # '#' must be removed from the raw text *before* escaping: stripping
        # it from the finished HTML would also hit the numeric entities that
        # html.escape() generates (e.g. "&#x27;" for an apostrophe) and turn
        # them into broken "&x27;" sequences.
        return html.escape(remove_hashes(fragment))

    # Walk the matches back to front so each slice is taken from the original
    # text; the pieces are collected in reverse and flipped at the end.
    parts = []
    last_idx = len(text)
    for match in reversed(matches):
        # Clamp to the part of the text not yet emitted.  Without this, two
        # overlapping matches would output the overlapping characters twice.
        end = min(match.offset + match.error_length, last_idx)
        start = min(match.offset, end)
        if start == end:
            # Nothing left to highlight (fully overlapped or zero-width).
            continue
        parts.append(_escape(text[end:last_idx]))
        suggestions = ", ".join(match.replacements[:3]) if match.replacements else "keine Vorschläge"
        parts.append(
            f'<span style="color: red; text-decoration: underline;" '
            f'title="{html.escape(match.message)} (Vorschläge: {html.escape(suggestions)})">'
            f'{_escape(text[start:end])}</span>'
        )
        last_idx = start
    parts.append(_escape(text[:last_idx]))

    html_text = "".join(reversed(parts)).replace("\n", "<br>")
    html_result = f"<div><strong>⚠️ {len(matches)} Fehler gefunden</strong><br><br>{html_text}</div>"
    return replace_markdown_images(html_result)
def _filter_english_words(text: str, matches: list) -> list:
    """Drop German-speller matches whose flagged word is common in English.

    Args:
        text: The text the matches refer to; ``offset``/``error_length`` of
            each match index into it.
        matches: Match objects exposing ``rule_id``, ``offset`` and
            ``error_length``.

    Returns:
        A new list, in the original order, without the
        ``GERMAN_SPELLER_RULE`` matches whose lower-cased word has an English
        ``word_frequency`` above ``1e-6``.  Matches of any other rule are
        always kept.
    """

    def _is_english_word(match) -> bool:
        # Only speller findings can be false positives caused by English.
        if match.rule_id != 'GERMAN_SPELLER_RULE':
            return False
        flagged = text[match.offset:match.offset + match.error_length].lower()
        return word_frequency(flagged, 'en') > 1e-6

    return [match for match in matches if not _is_english_word(match)]
def convert_document(file):
    """Convert an uploaded PDF to text with Marker and spell-check it.

    Args:
        file: The value delivered by the upload component: an object with a
            ``name`` attribute holding the file path, a plain path string, or
            ``None`` when nothing was uploaded.

    Returns:
        The HTML produced by ``check_spelling`` for the extracted text, with
        every occurrence of ``bad_string`` removed.  If no file was uploaded,
        the same hint that ``check_spelling`` shows for empty input.
    """
    # Clicking "Convert" without an upload passes None; show the hint instead
    # of failing on the attribute access below.
    if file is None:
        return "*Bitte lade eine Datei hoch oder füge Text ein...*"

    # The previous per-call download of data.json was removed: its result was
    # only bound to unused local variables, so it had no effect apart from a
    # blocking network request without a timeout on every conversion.

    # Accept both file-like upload objects and plain path strings.
    pdf_path = getattr(file, "name", file)
    rendered = marker_converter(pdf_path)
    text, _, _ = text_from_rendered(rendered)
    return check_spelling(text).replace(bad_string, "")
def add_ort(neuer_ort: str) -> str:
    """Submit a new term to the remote word list and report the outcome.

    Args:
        neuer_ort: The term to add; surrounding whitespace is ignored and
            ``None`` is treated as empty.

    Returns:
        A user-facing status message: a prompt if the input is empty, a
        success message including the new total on HTTP 200 with a truthy
        ``success`` field, an "already exists" note on HTTP 409, the server's
        ``error`` text (or a generic fallback) for any other response, or a
        connection error message if the request itself fails.
    """
    neuer_ort = (neuer_ort or "").strip()
    if not neuer_ort:
        return "⚠️ Bitte einen Ort eingeben."

    try:
        response = requests.post(
            "https://specialist-it.de/wortliste.php",
            json={"ort": neuer_ort},
            timeout=10,
        )
    except requests.exceptions.RequestException as e:
        return f"❌ Verbindungsfehler: {str(e)}"

    # The body is only needed for some branches and may not be JSON at all
    # (e.g. an HTML error page).  A decoding failure is a ValueError and must
    # neither escape nor hide the status-code based messages below.
    try:
        data = response.json()
    except ValueError:
        data = {}
    if not isinstance(data, dict):
        data = {}

    if response.status_code == 200 and data.get("success"):
        return f"✅ '{neuer_ort}' erfolgreich hinzugefügt. Gesamt: {data.get('anzahl_orte')} Orte."
    if response.status_code == 409:
        return f"ℹ️ '{neuer_ort}' existiert bereits in der Liste."
    return f"❌ Fehler: {data.get('error', 'Unbekannter Fehler')}"
# Gradio user interface: a PDF upload whose converted text is spell-checked
# and shown as HTML, plus a small form for adding terms to the word list.
with gr.Blocks() as app:
    gr.Markdown("# Language Tool \n")

    # The result area is declared before the upload widget and carries the
    # element id that the stylesheet in `css` targets.
    output_text = gr.HTML(label="Ergebnis", elem_id="spinner_md")
    file_input = gr.File(label="PDF hochladen", file_types=[".pdf"])
    convert_button = gr.Button("Convert")
    convert_button.click(
        fn=convert_document,
        inputs=[file_input],
        outputs=[output_text],
        show_progress="full"
    )

    gr.Markdown("---")
    gr.Markdown("### Begriff zur Wortliste hinzufügen (Begriff wird nicht korrigiert)")
    with gr.Row():
        ort_input = gr.Textbox(
            label="Ort",
            placeholder="z. B. Würzburg",
            scale=4
        )
        add_button = gr.Button("Hinzufügen", scale=1)
    # NOTE(review): the status line is assumed to sit below the row rather
    # than inside it -- confirm against the intended layout.
    add_status = gr.Markdown()
    add_button.click(
        fn=add_ort,
        inputs=[ort_input],
        outputs=[add_status]
    )

app.launch(debug=True, show_error=True, css=css)