Spaces:
Running
Running
| import os | |
| import gradio as gr | |
| from transformers import pipeline | |
| import pdfplumber | |
| import docx2txt | |
| from docx import Document | |
| from fpdf import FPDF | |
| from langdetect import detect | |
| import urllib.request | |
| import gradio.themes as gt | |
# === 🗂️ Create working directories and make sure the PDF font is available ===
os.makedirs("/app/models", exist_ok=True)

# TrueType font used when rendering the PDF export (needed for Cyrillic text).
FONT_PATH = "DejaVuSans.ttf"

# Download the font only when it is not already present in the working directory.
# NOTE(review): confirm that this URL still serves the TTF file.
if not os.path.exists(FONT_PATH):
    print("⬇️ Загружаю шрифт DejaVuSans.ttf ...")
    # Fetch into a temporary name first and rename on success, so that an
    # interrupted download never leaves a truncated file under FONT_PATH that
    # the existence check above would accept on the next start.
    _font_tmp_path = FONT_PATH + ".part"
    try:
        urllib.request.urlretrieve(
            "https://github.com/dejavu-fonts/dejavu-fonts/raw/master/ttf/DejaVuSans.ttf",
            _font_tmp_path,
        )
        os.replace(_font_tmp_path, FONT_PATH)
    finally:
        # Clean up the partial file if the download or the rename failed.
        if os.path.exists(_font_tmp_path):
            os.remove(_font_tmp_path)
# === ⚙️ Models are loaded once at startup ===
def load_model(task, model_name):
    """Build a transformers pipeline for ``task`` backed by ``model_name``.

    Args:
        task: Pipeline task name, e.g. ``"summarization"``.
        model_name: Hub identifier of the checkpoint to load.

    Returns:
        The pipeline object created by ``transformers.pipeline``.
    """
    print(f"🔹 Загружается модель: {model_name}")
    # ``cache_dir`` is a ``from_pretrained`` option, so it has to travel via
    # ``model_kwargs``. Passed as a bare keyword, ``pipeline()`` hands it to the
    # pipeline constructor instead of the model loader.
    return pipeline(
        task,
        model=model_name,
        model_kwargs={"cache_dir": "/app/models"},
    )
# One summarization pipeline per supported language code, built eagerly at
# import time so that requests never pay the model-loading cost.
summarizers = {
    lang_code: load_model("summarization", checkpoint)
    for lang_code, checkpoint in (
        ("en", "facebook/bart-large-cnn"),
        ("ru", "IlyaGusev/mbart_ru_sum_gazeta"),
        ("kz", "csebuetnlp/mT5_multilingual_XLSum"),
    )
}
# === 📄 Extract plain text from an uploaded file ===
def read_file(file):
    """Return the text content of an uploaded .pdf, .docx or plain-text file.

    Args:
        file: The uploaded file. Either an object exposing the on-disk path
            as ``.name`` or a plain path string.

    Returns:
        The extracted text with surrounding whitespace stripped, ``""`` when
        no file was given, or a message starting with "⚠️" when reading
        fails (callers detect failures by that prefix).
    """
    if not file:
        return ""

    # Always work from the on-disk path: the upload object itself is not
    # guaranteed to be a readable binary handle.
    path = getattr(file, "name", file)
    filename = str(path).lower()

    try:
        if filename.endswith(".pdf"):
            with pdfplumber.open(path) as pdf:
                page_texts = (page.extract_text() for page in pdf.pages)
                # Pages without extractable text yield a falsy value; skip them.
                text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)
        elif filename.endswith(".docx"):
            text = docx2txt.process(path) or ""
        else:
            # Anything else is treated as UTF-8 text; undecodable bytes are dropped.
            with open(path, "rb") as handle:
                text = handle.read().decode("utf-8", errors="ignore")
    except Exception as e:
        return f"⚠️ Ошибка при чтении файла: {e}"

    return text.strip()
# === 🌐 Language detection ===
# Letters that occur in Kazakh Cyrillic but not in the Russian alphabet.
_KAZAKH_LETTERS = frozenset("әіңғүұқөһ")


def detect_language(text):
    """Classify ``text`` as ``"ru"``, ``"kz"`` or ``"en"``.

    Args:
        text: The text to classify.

    Returns:
        ``"kz"`` when the text contains Kazakh-specific letters or the
        detector reports ``kk``; ``"ru"`` when the detector reports Russian;
        ``"en"`` for everything else, including detection failures.
    """
    # The letter check runs first and is case-insensitive. If it ran after the
    # detector, Kazakh text reported as Russian would never reach it, and a
    # detector failure would skip it entirely.
    # NOTE(review): langdetect's documented language list appears to have no
    # Kazakh profile, which makes this check the main route to "kz" — confirm.
    if not _KAZAKH_LETTERS.isdisjoint((text or "").lower()):
        return "kz"

    try:
        lang = detect(text)
    except Exception:
        # The detector raises on input it cannot classify; default to English.
        return "en"

    if lang.startswith("ru"):
        return "ru"
    if lang.startswith("kk"):
        return "kz"
    return "en"
# === 🧠 Text summarization ===
def _chunk_text(text, chunk_size, overlap):
    """Split ``text`` into ``chunk_size`` windows that overlap by ``overlap`` characters."""
    step = chunk_size - overlap
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + chunk_size])
        # Stop once a window reaches the end of the text; otherwise the next
        # window would hold only overlap characters that were already covered.
        if start + chunk_size >= len(text):
            break
        start += step
    return chunks


def summarize_text(text):
    """Summarize ``text`` with the pipeline matching its detected language.

    Args:
        text: Source text to summarize.

    Returns:
        A 6-tuple ``(summary, language label, model label, source length,
        summary length, compression)``. Lengths are character counts and
        compression is a percentage string. When the text is empty or shorter
        than 50 characters, a warning message is returned with placeholder
        values instead.
    """
    if not text or len(text) < 50:
        return "⚠️ Недостаточно текста для анализа.", "❌", "❌", 0, 0, "❌"

    lang = detect_language(text)
    model = summarizers.get(lang, summarizers["en"])
    flags = {"ru": "🇷🇺 Русский", "kz": "🇰🇿 Қазақ тілі", "en": "🇬🇧 English"}
    lang_label = flags.get(lang, "🌍 Unknown")
    model_label = getattr(model.model, "name_or_path", "Custom")

    summaries = []
    for part_no, chunk in enumerate(_chunk_text(text, chunk_size=2500, overlap=200), start=1):
        try:
            # truncation=True keeps a chunk that tokenizes longer than the
            # model's input limit from failing the call.
            result = model(
                chunk,
                max_length=180,
                min_length=40,
                do_sample=False,
                truncation=True,
            )
            summaries.append(result[0]['summary_text'])
        except Exception as e:
            # Keep going: one failed chunk is reported inline instead of
            # discarding the summaries of the other chunks.
            summaries.append(f"[Ошибка в части {part_no}: {e}]")

    summary = "\n\n".join(summaries).strip()
    src_len = len(text)
    sum_len = len(summary)
    compression = round(100 * (1 - sum_len / src_len), 1) if src_len > 0 else 0
    return summary, lang_label, model_label, src_len, sum_len, f"{compression}%"
# === 💾 Export helpers ===
def save_summary_as_txt(summary_text):
    """Write ``summary_text`` to ``summary.txt`` as UTF-8 and return that path."""
    target = "summary.txt"
    with open(target, mode="w", encoding="utf-8") as handle:
        handle.write(summary_text)
    return target
def save_summary_as_docx(summary_text):
    """Write ``summary_text`` under a level-1 heading to ``summary.docx`` and return that path."""
    document = Document()
    document.add_heading("Резюме документа", level=1)
    document.add_paragraph(summary_text)

    target = "summary.docx"
    document.save(target)
    return target
def save_summary_as_pdf(summary_text):
    """Render ``summary_text`` into ``summary.pdf`` with the font at FONT_PATH and return that path."""
    font_family = 'DejaVu'
    target = "summary.pdf"

    writer = FPDF()
    writer.add_page()
    # Register the TrueType font from FONT_PATH before selecting it.
    writer.add_font(font_family, '', FONT_PATH, uni=True)
    writer.set_font(font_family, '', 12)
    writer.multi_cell(0, 10, summary_text)
    writer.output(target)
    return target
# === 🚀 Main processing entry point ===
def summarize_file(file, progress=gr.Progress(track_tqdm=True)):
    """Read an uploaded document, summarize it and export the summary.

    Args:
        file: The uploaded file as provided by the Gradio ``File`` input.
        progress: Progress tracker. Gradio supplies it through this default
            argument; it is not a context manager, so it is called directly.

    Returns:
        A 9-tuple matching the interface outputs: summary text, language
        label, model label, source length, summary length, compression, and
        the paths of the TXT, DOCX and PDF exports. The three paths are
        ``None`` when the file could not be read or the text was rejected.
    """
    text = read_file(file)
    # read_file reports failures as a message prefixed with the warning sign.
    if text.startswith("⚠️"):
        return text, "❌", "❌", 0, 0, "❌", None, None, None

    progress(0, desc="🧠 Анализ текста...")
    summary, lang_label, model_label, src_len, sum_len, compression = summarize_text(text)
    progress(1, desc="✅ Готово!")

    # summarize_text marks rejected input (empty or too short) with "❌"
    # labels; there is nothing worth exporting in that case.
    if lang_label == "❌":
        return summary, lang_label, model_label, src_len, sum_len, compression, None, None, None

    txt_path = save_summary_as_txt(summary)
    docx_path = save_summary_as_docx(summary)
    pdf_path = save_summary_as_pdf(summary)
    return summary, lang_label, model_label, src_len, sum_len, compression, txt_path, docx_path, pdf_path
# === 🎨 Gradio interface ===
# Default theme recoloured with purple/violet hues, a light lilac page
# background and gradient-filled primary buttons.
custom_theme = gt.Default(primary_hue="purple", secondary_hue="violet").set(
    button_primary_background_fill_hover="linear-gradient(90deg, #6b21a8, #9333ea)",
    button_primary_background_fill="linear-gradient(90deg, #7e3ff2, #c084fc)",
    body_background_fill="#f8f6ff",
)
# Single-function interface: one file upload in, nine outputs out. The order
# of the output components matches the tuple returned by summarize_file.
demo = gr.Interface(
    title="🧠 Eroha Summarizer PRO (Автономная версия)",
    description="🚀 Определяет язык (🇷🇺 / 🇰🇿 / 🇬🇧), создаёт краткое резюме и сохраняет в TXT, DOCX, PDF с поддержкой кириллицы.",
    theme=custom_theme,
    allow_flagging="never",
    fn=summarize_file,
    inputs=gr.File(label="📂 Загрузите документ (.pdf, .docx, .txt)"),
    outputs=[
        # Summary and metadata
        gr.Textbox(label="🧾 Краткое резюме"),
        gr.Textbox(label="🌍 Определённый язык"),
        gr.Textbox(label="🧠 Используемая модель"),
        # Size statistics
        gr.Number(label="📄 Длина исходного текста"),
        gr.Number(label="📝 Длина резюме"),
        gr.Textbox(label="📉 Степень сокращения"),
        # Downloadable exports
        gr.File(label="📄 Скачать TXT"),
        gr.File(label="📘 Скачать DOCX"),
        gr.File(label="📕 Скачать PDF"),
    ],
)

# Listen on all interfaces, port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)