import gradio as gr
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
PdfPipelineOptions,
AcceleratorOptions
)
import spaces
from docling.datamodel.base_models import InputFormat
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from wordfreq import word_frequency
import tempfile
import io
import re
import os
import requests
import language_tool_python
import html
import time
# Fetch the custom place-name list used to extend LanguageTool's German
# spelling dictionary. The original had no timeout (a slow host would hang
# the import forever) and no error handling (any network/JSON failure
# crashed the whole app at import time) — degrade to an empty list instead,
# matching the defensive `tool is None` style used elsewhere in this file.
try:
    _data = requests.get("https://specialist-it.de/data.json", timeout=10).json()
except (requests.RequestException, ValueError):
    _data = {}
meine_orte = _data.get("meine_orte", [])
dummy = """
___|___|___|___|___|___|___|___|___|___|___|___|__
___|__<(-^,^-)=b_|___|___|___|___|___|___|___|___|
_|___|___|___|___|___|___|___|___|___|___|___|___|
___|___|___|___|___|___|___<'(o.o)'>|___|___|___|_
_|___|___|___|___|___|___|___|___|___|___|___|___|
___|___|_(-_-)zzz__|___|___|___|___|___|___|___|__
_|___|___|___|___|___|___|___|___|___|___|___|___|
___|___|___|___|___|___/ (^_^) /|___|___|___|___|_
_|___|___|___|___|___|___|___|___|___|___|___|___|
_|___|___|___|__d-(^_^)z_|___|___|___|___|___|___|
DUMMYIMAGE
"""
bad_string ="""page"""
# CSS injected into the Gradio app: while the element with id "spinner_md"
# carries the "pending" class, show a spinning circle (::before) followed by
# a bold "Generiere Antwort..." label (::after).
# Fix: the original string ended without closing the ::after rule — the
# missing "}" left the stylesheet malformed.
css = """
#spinner_md.pending::before {
content: "";
display: inline-block;
width: 20px;
height: 20px;
border: 3px solid #eee;
border-top-color: #2563eb; /* Gradio Blau */
border-radius: 50%;
animation: spin 1s linear infinite;
margin-right: 10px;
vertical-align: middle;
}
@keyframes spin {
to { transform: rotate(360deg); }
}
#spinner_md.pending::after {
content: " Generiere Antwort...";
font-weight: bold;
color: #2563eb;
vertical-align: middle;
}
"""
tool = language_tool_python.LanguageTool('de-DE', new_spellings=meine_orte)
# Matches Marker's inline image references, e.g. ![](_page_3_Picture_1.jpeg).
# Compiled once at module level so repeated calls don't re-look-up the pattern.
_IMAGE_MD_RE = re.compile(r'!\[\]\(_page_\d+_Picture_\d+\.jpeg\)')

def replace_markdown_images(text):
    """Replace Marker-style embedded page images with the ASCII-art dummy.

    Every ``![](_page_<n>_Picture_<m>.jpeg)`` reference in *text* is swapped
    for the module-level ``dummy`` placeholder string.
    (Fixes the original's dead local ``replacement = ''`` which was assigned
    but never used.)
    """
    return _IMAGE_MD_RE.sub(dummy, text)
def remove_hashes(text):
    """Strip every '#' character (e.g. Markdown heading markers) from *text*."""
    # A fixed single-character pattern needs no regex machinery.
    return text.replace('#', '')
# Docling
# CPU-only execution with 8 worker threads for the Docling PDF pipeline.
accelerator_options = AcceleratorOptions(
    num_threads=8, device=AcceleratorDevice.CPU
)
# Enable OCR and table-structure recognition (with cell matching) so that
# scanned pages and tables survive the conversion.
pipeline_options = PdfPipelineOptions()
pipeline_options.accelerator_options = accelerator_options
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
# Converter configured for PDF input only, using the options assembled above.
docling_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pipeline_options,
        )
    }
)
# Marker
# Marker-based PDF converter as an alternative backend to Docling.
# NOTE(review): create_model_dict() presumably downloads/loads model weights
# here at import time (heavy, one-off) — confirm before changing startup flow.
marker_converter = PdfConverter(
    artifact_dict=create_model_dict(),
)
def check_spelling(text_input: str) -> str:
"""Prüft den Text mit LanguageTool und markiert Fundstellen in HTML."""
IGNORE_RULES = {'DE_CASE'}
if tool is None:
return "❌ **Fehler:** LanguageTool konnte nicht geladen werden."
text = text_input or ""
if not text.strip():
return "*Bitte lade eine Datei hoch oder füge Text ein...*"
if text.startswith("❌") or text.startswith("Fehler"):
return text
matches = tool.check(text)
# Regel-Filter
matches = [m for m in matches if m.rule_id not in IGNORE_RULES]
# Englische Wörter herausfiltern
matches = _filter_english_words(text, matches)
if not matches:
return "
✅ Keine Fehler gefunden!
" # HTML-Aufbau für die Anzeige der Fehler parts = [] text_len = len(text) last_idx = text_len for match in reversed(matches): start = match.offset end = match.offset + match.error_length parts.append(html.escape(text[end:last_idx])) word = text[start:end] suggestions = ", ".join(match.replacements[:3]) if match.replacements else "keine Vorschläge" error_html = ( f'' f'{html.escape(word)}' ) parts.append(error_html) last_idx = start parts.append(html.escape(text[:last_idx])) html_texte = "".join(reversed(parts)).replace("\n\n", "