"""Gradio app that summarizes an uploaded PDF, DOCX or plain-text document.

The script loads three summarization pipelines at import time (keyed "en",
"ru" and "kz"), picks one based on the detected language of the uploaded
text, summarizes the text chunk by chunk and offers the result as TXT,
DOCX and PDF downloads.
"""
import os
import urllib.request

import docx2txt
import gradio as gr
import gradio.themes as gt
import pdfplumber
from docx import Document
from fpdf import FPDF
from langdetect import detect
from transformers import pipeline

# === 🗂️ Directories and font setup ===
MODEL_CACHE_DIR = "/app/models"
os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
FONT_PATH = "DejaVuSans.ttf"

# Download the font if it is not present; it is registered with FPDF in
# save_summary_as_pdf() so that non-Latin summaries can be written.
if not os.path.exists(FONT_PATH):
    print("⬇️ Загружаю шрифт DejaVuSans.ttf ...")
    urllib.request.urlretrieve(
        "https://github.com/dejavu-fonts/dejavu-fonts/raw/master/ttf/DejaVuSans.ttf",
        FONT_PATH,
    )

# Letters used as a marker for Kazakh text in detect_language().
KAZAKH_LETTERS = frozenset("әіңғүұқөһ")


# === ⚙️ Load the models once at startup ===
def load_model(task, model_name):
    """Create a transformers pipeline for *task* backed by *model_name*.

    Args:
        task: Pipeline task name, e.g. ``"summarization"``.
        model_name: Hub identifier of the model to load.

    Returns:
        The pipeline object returned by ``transformers.pipeline``.
    """
    print(f"🔹 Загружается модель: {model_name}")
    # The cache directory is a from_pretrained() option, so it is handed over
    # through ``model_kwargs`` instead of as a loose keyword argument.
    return pipeline(
        task,
        model=model_name,
        model_kwargs={"cache_dir": MODEL_CACHE_DIR},
    )


summarizers = {
    "en": load_model("summarization", "facebook/bart-large-cnn"),
    "ru": load_model("summarization", "IlyaGusev/mbart_ru_sum_gazeta"),
    "kz": load_model("summarization", "csebuetnlp/mT5_multilingual_XLSum"),
}


# === 📄 Reading text from uploaded files ===
def read_file(file):
    """Extract the text content of an uploaded file.

    Args:
        file: The uploaded file: either an object exposing the path as
            ``.name`` or a plain path string. A falsy value means "no file".

    Returns:
        The stripped text, ``""`` when no file was given, or a message
        starting with "⚠️" when reading failed (summarize_file() checks for
        that prefix).
    """
    if not file:
        return ""

    # Everything is read by path, so no open file handle is required.
    path = str(getattr(file, "name", file))
    lowered = path.lower()
    try:
        if lowered.endswith(".pdf"):
            with pdfplumber.open(path) as pdf:
                page_texts = [page.extract_text() for page in pdf.pages]
            # Pages without extractable text yield a falsy value; skip them.
            text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)
        elif lowered.endswith(".docx"):
            text = docx2txt.process(path)
        else:
            with open(path, "rb") as handle:
                text = handle.read().decode("utf-8", errors="ignore")
    except Exception as e:
        # Best effort: report the problem in-band instead of crashing the UI.
        return f"⚠️ Ошибка при чтении файла: {e}"
    return text.strip()


# === 🌐 Language detection ===
def detect_language(text):
    """Map *text* to one of the ``summarizers`` keys: "ru", "kz" or "en".

    The Kazakh-letter check runs before the langdetect result is consulted,
    so Kazakh text is not routed to the Russian model merely because the
    detector reported "ru". The trade-off is that a text containing even a
    single one of these letters is treated as Kazakh.

    Returns:
        "kz", "ru" or "en"; "en" is also the fallback when detection fails.
    """
    if any(char in KAZAKH_LETTERS for char in text.lower()):
        return "kz"
    try:
        lang = detect(text)
    except Exception:
        # Detection is best effort; fall back to the English model.
        return "en"
    if lang.startswith("ru"):
        return "ru"
    if lang.startswith("kk"):
        return "kz"
    return "en"


# === 🧠 Text summarization ===
def summarize_text(text):
    """Summarize *text* chunk by chunk with the model for its language.

    Args:
        text: Source text; fewer than 50 characters is rejected.

    Returns:
        A 6-tuple ``(summary, language label, model label, source length,
        summary length, compression string)``. For too-short input the
        summary slot holds a "⚠️" message and the labels are "❌".
    """
    if not text or len(text) < 50:
        return "⚠️ Недостаточно текста для анализа.", "❌", "❌", 0, 0, "❌"

    lang = detect_language(text)
    model = summarizers.get(lang, summarizers["en"])

    flags = {"ru": "🇷🇺 Русский", "kz": "🇰🇿 Қазақ тілі", "en": "🇬🇧 English"}
    lang_label = flags.get(lang, "🌍 Unknown")
    model_label = getattr(model.model, "name_or_path", "Custom")

    chunk_size = 2500
    overlap = 200
    step = chunk_size - overlap

    summaries = []
    for start in range(0, len(text), step):
        chunk = text[start:start + chunk_size]
        try:
            # truncation=True clips a chunk that tokenizes to more than the
            # model accepts instead of letting the call fail.
            result = model(
                chunk,
                max_length=180,
                min_length=40,
                do_sample=False,
                truncation=True,
            )
            summaries.append(result[0]['summary_text'])
        except Exception as e:
            # Keep going: one failed chunk is reported inline in the output.
            summaries.append(f"[Ошибка в части {len(summaries)+1}: {e}]")
        # This chunk already reached the end of the text; a further window
        # would only re-summarize the overlapping tail.
        if start + chunk_size >= len(text):
            break

    summary = "\n\n".join(summaries).strip()
    src_len = len(text)
    sum_len = len(summary)
    compression = round(100 * (1 - sum_len / src_len), 1) if src_len > 0 else 0
    return summary, lang_label, model_label, src_len, sum_len, f"{compression}%"


# === 💾 Saving the summary to files ===
def save_summary_as_txt(summary_text):
    """Write *summary_text* to ``summary.txt`` (UTF-8) and return the path."""
    path = "summary.txt"
    with open(path, "w", encoding="utf-8") as f:
        f.write(summary_text)
    return path


def save_summary_as_docx(summary_text):
    """Write *summary_text* to ``summary.docx`` under a heading; return the path."""
    path = "summary.docx"
    doc = Document()
    doc.add_heading("Резюме документа", level=1)
    doc.add_paragraph(summary_text)
    doc.save(path)
    return path


def save_summary_as_pdf(summary_text):
    """Write *summary_text* to ``summary.pdf`` using the DejaVu font; return the path."""
    path = "summary.pdf"
    pdf = FPDF()
    pdf.add_page()
    # Register the TrueType font from FONT_PATH and select it for the text.
    pdf.add_font('DejaVu', '', FONT_PATH, uni=True)
    pdf.set_font('DejaVu', '', 12)
    pdf.multi_cell(0, 10, summary_text)
    pdf.output(path)
    return path


# === 🚀 Main processing function ===
def summarize_file(file, progress=gr.Progress(track_tqdm=True)):
    """Read an uploaded file, summarize it and export the summary.

    Args:
        file: The uploaded file as delivered by the ``gr.File`` input.
        progress: Progress tracker. It is declared as a defaulted parameter
            because ``gr.Progress`` is requested that way; it is not a
            context manager.

    Returns:
        A 9-tuple matching the interface outputs: summary, language label,
        model label, source length, summary length, compression string and
        the TXT, DOCX and PDF paths (``None`` for the paths when reading the
        file failed).
    """
    text = read_file(file)
    # read_file() reports failures in-band with a "⚠️" prefix.
    if text.startswith("⚠️"):
        return text, "❌", "❌", 0, 0, "❌", None, None, None

    progress(0, desc="🧠 Анализ текста...")
    summary, lang_label, model_label, src_len, sum_len, compression = summarize_text(text)
    progress(1, desc="✅ Готово!")

    txt_path = save_summary_as_txt(summary)
    docx_path = save_summary_as_docx(summary)
    pdf_path = save_summary_as_pdf(summary)
    return (
        summary,
        lang_label,
        model_label,
        src_len,
        sum_len,
        compression,
        txt_path,
        docx_path,
        pdf_path,
    )


# === 🎨 Gradio interface ===
custom_theme = gt.Default(
    primary_hue="purple",
    secondary_hue="violet",
).set(
    body_background_fill="#f8f6ff",
    button_primary_background_fill="linear-gradient(90deg, #7e3ff2, #c084fc)",
    button_primary_background_fill_hover="linear-gradient(90deg, #6b21a8, #9333ea)",
)

demo = gr.Interface(
    fn=summarize_file,
    inputs=gr.File(label="📂 Загрузите документ (.pdf, .docx, .txt)"),
    outputs=[
        gr.Textbox(label="🧾 Краткое резюме"),
        gr.Textbox(label="🌍 Определённый язык"),
        gr.Textbox(label="🧠 Используемая модель"),
        gr.Number(label="📄 Длина исходного текста"),
        gr.Number(label="📝 Длина резюме"),
        gr.Textbox(label="📉 Степень сокращения"),
        gr.File(label="📄 Скачать TXT"),
        gr.File(label="📘 Скачать DOCX"),
        gr.File(label="📕 Скачать PDF"),
    ],
    title="🧠 Eroha Summarizer PRO (Автономная версия)",
    description="🚀 Определяет язык (🇷🇺 / 🇰🇿 / 🇬🇧), создаёт краткое резюме и сохраняет в TXT, DOCX, PDF с поддержкой кириллицы.",
    theme=custom_theme,
    allow_flagging="never"
)

demo.launch(server_name="0.0.0.0", server_port=7860)