Spaces:
Running
Running
import os
import tempfile

import docx2txt
import gradio as gr
import pdfplumber
from docx import Document
from fpdf import FPDF
from transformers import pipeline
# Load the summarization model once, when the module is imported; the
# summarize_text function below reuses this single pipeline object.
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")
def read_file(file):
    """Extract plain text from an uploaded PDF, DOCX or plain-text file.

    Args:
        file: The upload handed over by the UI. May be ``None`` (nothing
            uploaded), a filesystem path (``str`` / ``os.PathLike``), or any
            object that exposes the path through a ``name`` attribute.

    Returns:
        The extracted text with surrounding whitespace stripped, ``""`` when
        no file was given, or a message starting with
        ``"Ошибка при чтении файла: "`` when reading failed. Callers detect
        failures through that prefix, so the message text must stay as is.
    """
    if file is None:
        return ""

    # Accept a bare path as well as a file-like wrapper carrying ``.name``.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name
    filename = str(path).lower()

    try:
        if filename.endswith(".pdf"):
            with pdfplumber.open(path) as pdf:
                pages = [page.extract_text() for page in pdf.pages]
            # Skip pages for which no text was extracted (None or "").
            text = "".join(f"{page_text}\n" for page_text in pages if page_text)
        elif filename.endswith(".docx"):
            # Guard against a falsy result so .strip() below cannot fail.
            text = docx2txt.process(path) or ""
        else:
            # Read from disk by path instead of calling file.read(): the
            # upload may be a plain path or a wrapper without read(), and an
            # already-consumed handle would yield nothing.
            with open(path, "rb") as stream:
                text = stream.read().decode("utf-8", errors="ignore")
    except Exception as e:
        # Best-effort boundary: any reader failure is reported to the UI as
        # text rather than crashing the request.
        return f"Ошибка при чтении файла: {str(e)}"
    return text.strip()
def summarize_text(text):
    """Summarize *text* chunk by chunk and join the partial summaries.

    The text is cut into overlapping character windows, each window is
    summarized separately, and the results are joined with blank lines.

    Args:
        text: The text to summarize. ``None``, an empty string, or anything
            shorter than 50 characters is rejected.

    Returns:
        The joined summaries, or a warning message when there is too little
        text. A chunk whose summarization fails contributes a bracketed
        error note instead of aborting the whole run.
    """
    if not text or len(text) < 50:
        return "⚠️ Недостаточно текста для суммаризации."

    chunk_size = 2500  # characters per chunk handed to the model
    overlap = 200  # characters shared by neighbouring chunks
    step = chunk_size - overlap

    summaries = []
    for start in range(0, len(text), step):
        chunk = text[start:start + chunk_size]
        try:
            result = summarizer(
                chunk,
                max_length=180,
                min_length=40,
                do_sample=False,
                # Cut a chunk that still exceeds the model's input limit
                # instead of failing on it.
                truncation=True,
            )
            summaries.append(result[0]['summary_text'])
        except Exception as e:
            # Failed chunks are appended too, so len(summaries) + 1 is the
            # 1-based number of the current chunk.
            summaries.append(f"[Ошибка в части {len(summaries) + 1}: {str(e)}]")
        # Once a chunk reaches the end of the text, stop: the next window
        # would lie entirely inside the overlap that was just summarized.
        if start + chunk_size >= len(text):
            break

    return "\n\n".join(summaries).strip()
def save_summary_as_docx(summary_text):
    """Write the summary to a DOCX file and return its path.

    The document gets a level-1 heading followed by the summary as a single
    paragraph.

    Args:
        summary_text: The text to store in the document body.

    Returns:
        Path of the written ``summary.docx``. The file lives in a directory
        created for this call, so simultaneous requests cannot overwrite
        each other's result; the directory is not removed by this function.
    """
    doc = Document()
    doc.add_heading("Резюме документа", level=1)
    doc.add_paragraph(summary_text)
    # A fresh directory per call keeps the download name stable while
    # avoiding a single shared file in the working directory.
    path = os.path.join(tempfile.mkdtemp(prefix="summary_"), "summary.docx")
    doc.save(path)
    return path
def save_summary_as_txt(summary_text):
    """Write the summary to a UTF-8 text file and return its path.

    Args:
        summary_text: The text to store.

    Returns:
        Path of the written ``summary.txt``. The file lives in a directory
        created for this call, so simultaneous requests cannot overwrite
        each other's result; the directory is not removed by this function.
    """
    # A fresh directory per call keeps the download name stable while
    # avoiding a single shared file in the working directory.
    path = os.path.join(tempfile.mkdtemp(prefix="summary_"), "summary.txt")
    with open(path, "w", encoding="utf-8") as f:
        f.write(summary_text)
    return path
def save_summary_as_pdf(summary_text):
    """Render the summary into a one-column PDF and return its path.

    The Unicode font ``DejaVuSans.ttf`` is used when it can be loaded (the
    font file has to be shipped with the project). Otherwise the built-in
    Arial font is used and characters it cannot encode are replaced with
    ``?`` so that rendering still succeeds.

    Args:
        summary_text: The text to render.

    Returns:
        Path of the written ``summary.pdf``. The file lives in a directory
        created for this call, so simultaneous requests cannot overwrite
        each other's result; the directory is not removed by this function.
    """
    pdf = FPDF()
    pdf.add_page()
    try:
        pdf.add_font('DejaVu', '', 'DejaVuSans.ttf', uni=True)
        pdf.set_font('DejaVu', '', 12)
    except Exception:
        # The exception type raised for a missing or unreadable font differs
        # between FPDF releases, hence the broad (but not bare) catch.
        pdf.set_font("Arial", size=12)
        # The built-in fonts are limited to Latin-1; without this step
        # Cyrillic text or emoji would raise an encoding error below.
        summary_text = summary_text.encode("latin-1", errors="replace").decode("latin-1")
    pdf.multi_cell(0, 10, summary_text)
    # A fresh directory per call keeps the download name stable while
    # avoiding a single shared file in the working directory.
    path = os.path.join(tempfile.mkdtemp(prefix="summary_"), "summary.pdf")
    pdf.output(path)
    return path
def summarize_file(file):
    """Summarize an uploaded document and export the result in three formats.

    Args:
        file: The upload received from the UI, passed on to ``read_file``.

    Returns:
        A 4-tuple ``(summary, txt_path, docx_path, pdf_path)``. When the file
        could not be read, the first item is the error message and the three
        paths are ``None``.
    """
    text = read_file(file)
    # Match the complete prefix that read_file puts on its failure message.
    # Testing for the first word alone would also reject any document whose
    # own text happens to begin with that word.
    if text.startswith("Ошибка при чтении файла:"):
        return text, None, None, None

    summary = summarize_text(text)
    txt_path = save_summary_as_txt(summary)
    docx_path = save_summary_as_docx(summary)
    pdf_path = save_summary_as_pdf(summary)
    return summary, txt_path, docx_path, pdf_path
# Gradio UI: one uploaded document in; the summary text and three
# downloadable files (TXT, DOCX, PDF) out.
demo = gr.Interface(
    title="🧠 Eroha Summarizer",
    description="Загрузите PDF, DOCX или TXT файл. Модель создаст краткое резюме и позволит скачать результат в любом формате.",
    fn=summarize_file,
    inputs=gr.File(label="Загрузите документ (.pdf, .docx или .txt)"),
    outputs=[gr.Textbox(label="Краткое резюме")] + [
        gr.File(label=caption)
        for caption in ("📄 Скачать TXT", "📘 Скачать DOCX", "📕 Скачать PDF")
    ],
)

# Listen on all interfaces on port 7860.
demo.launch(server_port=7860, server_name="0.0.0.0")