Spaces:
Runtime error
Runtime error
| import os | |
| import gradio as gr | |
| from transformers import pipeline | |
| import pdfplumber | |
| import docx2txt | |
| from docx import Document | |
| from fpdf import FPDF | |
| from langdetect import detect | |
| import urllib.request | |
| # === Настройки === | |
# Make sure the model cache directory exists before any pipeline is loaded.
os.makedirs("/app/models", exist_ok=True)

# TrueType font registered with FPDF when the PDF summary is rendered.
FONT_PATH = "DejaVuSans.ttf"
if not os.path.exists(FONT_PATH):
    # Download to a temporary name and rename only on success, so that an
    # interrupted transfer cannot leave a truncated file behind that would
    # pass the existence check on every later start.
    # NOTE(review): confirm this URL still serves the TTF file.
    _font_tmp_path = FONT_PATH + ".part"
    try:
        urllib.request.urlretrieve(
            "https://github.com/dejavu-fonts/dejavu-fonts/raw/master/ttf/DejaVuSans.ttf",
            _font_tmp_path,
        )
        os.replace(_font_tmp_path, FONT_PATH)
    finally:
        # Only present here if the download or the rename failed.
        if os.path.exists(_font_tmp_path):
            os.remove(_font_tmp_path)
| # === Модели === | |
def load_model(task, name, cache_dir="/app/models"):
    """Create a Hugging Face pipeline for *task* backed by checkpoint *name*.

    Args:
        task: Task identifier forwarded to ``pipeline``.
        name: Model checkpoint identifier to load.
        cache_dir: Directory in which downloaded model files are cached.

    Returns:
        The object returned by ``transformers.pipeline``.
    """
    print(f"🔹 Загружается модель: {name}")
    # ``cache_dir`` is an option of ``from_pretrained``; ``pipeline`` forwards
    # such options through ``model_kwargs``. Passed as a direct keyword it is
    # not applied to model loading.
    return pipeline(task, model=name, model_kwargs={"cache_dir": cache_dir})
# One summarization pipeline per supported language key, loaded eagerly at
# import time in the order listed below.
models = {
    lang_key: load_model("summarization", checkpoint)
    for lang_key, checkpoint in (
        ("en", "facebook/bart-large-cnn"),
        ("ru", "IlyaGusev/mbart_ru_sum_gazeta"),
        ("kz", "csebuetnlp/mT5_multilingual_XLSum"),
    )
}
| # === Чтение файлов === | |
def read_file(file):
    """Extract plain text from an uploaded ``.pdf``, ``.docx`` or text file.

    Args:
        file: Either a filesystem path or an object whose ``name`` attribute
            holds the path of the uploaded file. A falsy value means that
            nothing was uploaded.

    Returns:
        The extracted text with surrounding whitespace stripped, ``""`` when
        no file was given, or a message starting with ``"⚠️"`` when reading
        failed.
    """
    if not file:
        return ""
    try:
        # Accept both a plain path and a file-like wrapper exposing ``.name``.
        if isinstance(file, (str, os.PathLike)):
            path = os.fspath(file)
        else:
            path = file.name
        lowered = path.lower()
        if lowered.endswith(".pdf"):
            with pdfplumber.open(path) as pdf:
                page_texts = (page.extract_text() for page in pdf.pages)
                # Pages without extractable text yield a falsy value; skip them.
                text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)
        elif lowered.endswith(".docx"):
            text = docx2txt.process(path)
        else:
            # Re-open by path in binary mode so the result does not depend on
            # the mode or read position of the handle that was passed in.
            with open(path, "rb") as handle:
                text = handle.read().decode("utf-8", errors="ignore")
    except Exception as e:
        # Errors are reported as a "⚠️"-prefixed string instead of raising;
        # callers test for that prefix.
        return f"⚠️ Ошибка при чтении: {e}"
    return (text or "").strip()
| # === Определение языка === | |
def detect_language(text):
    """Map *text* to one of the language keys ``"ru"``, ``"kz"`` or ``"en"``.

    Args:
        text: The text whose language should be determined.

    Returns:
        ``"kz"`` when the text contains a Kazakh-specific Cyrillic letter or
        the detector reports ``kk``; ``"ru"`` when the detector reports
        Russian; ``"en"`` otherwise, including when detection fails.
    """
    kazakh_letters = "әіңғүұқөһ"
    # Look for Kazakh-specific letters first: if the detector labels Kazakh
    # text as Russian, a check placed after the "ru" branch never runs.
    # Lower-casing makes the check cover capital letters as well.
    if text and any(letter in text.lower() for letter in kazakh_letters):
        return "kz"
    try:
        lang = detect(text)
    except Exception:
        # Detection can fail, e.g. on empty input; fall back to English.
        return "en"
    if lang.startswith("ru"):
        return "ru"
    if lang.startswith("kk"):
        return "kz"
    return "en"
| # === Суммаризация === | |
def _split_into_chunks(text, chunk_size, overlap):
    """Split *text* into windows of *chunk_size* characters overlapping by *overlap*."""
    if chunk_size <= 0 or not 0 <= overlap < chunk_size:
        raise ValueError(
            "chunk_size must be positive and overlap must satisfy "
            "0 <= overlap < chunk_size"
        )
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + chunk_size])
        # Stop once a window reaches the end of the text; a further window
        # would consist solely of characters already covered by this one.
        if start + chunk_size >= len(text):
            break
    return chunks


def summarize_text(text, chunk_size=2500, overlap=200):
    """Summarize *text* chunk by chunk with the model for its detected language.

    Args:
        text: The text to summarize.
        chunk_size: Number of characters per chunk handed to the model.
        overlap: Number of characters shared by consecutive chunks.

    Returns:
        A tuple ``(summary, language label, model label, source length,
        summary length, compression)``. Lengths are character counts and
        compression is a percentage string. When *text* is empty or shorter
        than 50 characters, a warning message is returned as the summary and
        the remaining fields are placeholders.

    Raises:
        ValueError: If *chunk_size* is not positive or *overlap* is not in
            ``[0, chunk_size)``.
    """
    if not text or len(text) < 50:
        return "⚠️ Недостаточно текста", "❌", "❌", 0, 0, "❌"

    lang = detect_language(text)
    model = models.get(lang, models["en"])
    flags = {"ru": "🇷🇺 Русский", "kz": "🇰🇿 Қазақ тілі", "en": "🇬🇧 English"}
    lang_label = flags.get(lang, "🌍 Unknown")
    model_label = getattr(model.model, "name_or_path", "Custom")

    summaries = []
    for part_no, chunk in enumerate(_split_into_chunks(text, chunk_size, overlap), start=1):
        try:
            # truncation=True asks the tokenizer to truncate over-long input.
            res = model(
                chunk,
                max_length=180,
                min_length=40,
                do_sample=False,
                truncation=True,
            )
            summaries.append(res[0]['summary_text'])
        except Exception as e:
            # Keep going: a failed chunk is recorded in place of its summary.
            summaries.append(f"[Ошибка в части {part_no}: {e}]")

    summary = "\n\n".join(summaries).strip()
    src_len = len(text)
    sum_len = len(summary)
    comp = f"{round(100*(1 - sum_len/src_len),1)}%" if src_len else "0%"
    return summary, lang_label, model_label, src_len, sum_len, comp
| # === Сохранение === | |
def save_summary(summary, output_dir=None):
    """Write *summary* to TXT, DOCX and PDF files and return their paths.

    Args:
        summary: The summary text to export.
        output_dir: Directory to write into. When ``None``, a fresh temporary
            directory is created for this call, so that simultaneous requests
            do not overwrite each other's files.

    Returns:
        A tuple ``(txt_path, docx_path, pdf_path)``.
    """
    import tempfile  # local import: only this function needs it

    if output_dir is None:
        output_dir = tempfile.mkdtemp(prefix="summary_")
    else:
        os.makedirs(output_dir, exist_ok=True)

    txt_path = os.path.join(output_dir, "summary.txt")
    docx_path = os.path.join(output_dir, "summary.docx")
    pdf_path = os.path.join(output_dir, "summary.pdf")

    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(summary)

    word_doc = Document()
    word_doc.add_heading("Резюме", 0)
    word_doc.add_paragraph(summary)
    word_doc.save(docx_path)

    pdf_doc = FPDF()
    pdf_doc.add_page()
    # Register the TrueType font at FONT_PATH and select it for the text.
    pdf_doc.add_font('DejaVu', '', FONT_PATH, uni=True)
    pdf_doc.set_font('DejaVu', '', 12)
    pdf_doc.multi_cell(0, 10, summary)
    pdf_doc.output(pdf_path)

    return txt_path, docx_path, pdf_path
| # === Основная функция === | |
def summarize_file(file, progress=gr.Progress()):
    """Read an uploaded file, summarize it and export the summary.

    Args:
        file: The uploaded file as delivered by the ``gr.File`` input.
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        A 9-tuple matching the interface outputs: summary text, language
        label, model label, source length, summary length, compression, and
        the paths of the TXT, DOCX and PDF exports. The three paths are
        ``None`` when the file could not be read or the text was rejected.
    """
    text = read_file(file)
    if text.startswith("⚠️"):
        # read_file reports failures as a "⚠️"-prefixed message.
        return text, "❌", "❌", 0, 0, "❌", None, None, None

    progress(0.2, "🧠 Анализ текста...")
    summary, lang, model, src, summ, comp = summarize_text(text)
    if lang == "❌":
        # summarize_text rejected the input (too little text); do not export
        # its warning message as if it were a summary.
        return summary, lang, model, src, summ, comp, None, None, None

    progress(0.8, "📄 Сохранение файлов...")
    txt, docx, pdf = save_summary(summary)
    progress(1, "✅ Готово!")
    return summary, lang, model, src, summ, comp, txt, docx, pdf
| # === Интерфейс === | |
# Build the UI: one file input, a trigger button, and nine outputs that
# receive the values returned by summarize_file, in the same order.
with gr.Blocks() as demo:
    gr.Markdown("<h2 style='text-align:center'>🧠 Eroha Summarizer PRO</h2>")

    # Input side.
    file_input = gr.File(label="📂 Загрузите документ (.pdf, .docx, .txt)")
    run_btn = gr.Button("🔍 Сгенерировать резюме", variant="primary")

    # Summary text and statistics.
    summary_out = gr.Textbox(label="🧾 Краткое резюме", lines=10)
    lang_out = gr.Textbox(label="🌍 Определённый язык")
    model_out = gr.Textbox(label="🧠 Используемая модель")
    src_len = gr.Number(label="📄 Длина исходного текста")
    sum_len = gr.Number(label="📝 Длина резюме")
    comp = gr.Textbox(label="📉 Степень сокращения")

    # Downloadable exports.
    txt = gr.File(label="📄 TXT")
    docx = gr.File(label="📘 DOCX")
    pdf = gr.File(label="📕 PDF")

    result_components = [
        summary_out,
        lang_out,
        model_out,
        src_len,
        sum_len,
        comp,
        txt,
        docx,
        pdf,
    ]
    run_btn.click(fn=summarize_file, inputs=file_input, outputs=result_components)

# Listen on all interfaces on port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)