Spaces:
Running
Running
File size: 6,384 Bytes
2b1b52c ad5f036 71b5660 9c815d5 9e85362 50ad747 4d0947f d93b7e1 cbb7d22 11a33db cbb7d22 2b1b52c cbb7d22 2b1b52c cbb7d22 2b1b52c cbb7d22 2b1b52c d93b7e1 2b1b52c 4d0947f 2b1b52c 4d0947f 71b5660 cbb7d22 9c815d5 0d0b783 928a3cd 9e85362 50ad747 cbb7d22 9e85362 d93b7e1 cbb7d22 9e85362 9c815d5 cbb7d22 4d0947f 0d0b783 4d0947f cbb7d22 9c815d5 9e85362 0d0b783 4d0947f cbb7d22 0d0b783 d93b7e1 cbb7d22 4d0947f 928a3cd 0d0b783 50ad747 9c815d5 4d0947f 9e85362 9c815d5 0d0b783 81f4a18 cbb7d22 0d0b783 9e85362 4d0947f 9e85362 50ad747 9e85362 50ad747 d93b7e1 50ad747 9c815d5 cbb7d22 9c815d5 d93b7e1 0d0b783 d93b7e1 50ad747 928a3cd 0d0b783 cbb7d22 928a3cd d93b7e1 9e85362 62d34db 81f4a18 2b1b52c cbb7d22 9e85362 cbb7d22 d93b7e1 928a3cd ad5f036 9c815d5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
import os
import gradio as gr
from transformers import pipeline
import pdfplumber
import docx2txt
from docx import Document
from fpdf import FPDF
from langdetect import detect
import urllib.request
import gradio.themes as gt
# === 🗂️ Create working folders and make sure the PDF font is available ===
# "/app/models" is the directory handed to the model loader as its cache.
os.makedirs("/app/models", exist_ok=True)
FONT_PATH = "DejaVuSans.ttf"
# Download the font when it is missing. The data is written to a temporary
# ".part" file and only moved to FONT_PATH once the transfer has finished, so
# an interrupted download can never leave a truncated file that the
# existence check above would accept as a valid font on the next start.
if not os.path.exists(FONT_PATH):
    print("⬇️ Загружаю шрифт DejaVuSans.ttf ...")
    _font_part_path = FONT_PATH + ".part"
    try:
        urllib.request.urlretrieve(
            "https://github.com/dejavu-fonts/dejavu-fonts/raw/master/ttf/DejaVuSans.ttf",
            _font_part_path,
        )
        # Atomic on the same filesystem: FONT_PATH appears fully written.
        os.replace(_font_part_path, FONT_PATH)
    finally:
        # After a successful replace the ".part" file no longer exists; after
        # a failure this removes the partial data and the error propagates.
        if os.path.exists(_font_part_path):
            os.remove(_font_part_path)
# === ⚙️ Models are loaded once, at startup ===
def load_model(task, model_name, cache_dir="/app/models"):
    """Build a transformers pipeline for ``task`` backed by ``model_name``.

    Args:
        task: Pipeline task name, e.g. ``"summarization"``.
        model_name: Hugging Face Hub identifier of the model to load.
        cache_dir: Directory in which downloaded model and tokenizer files
            are cached.

    Returns:
        The object returned by ``transformers.pipeline``.
    """
    print(f"🔹 Загружается модель: {model_name}")
    # ``cache_dir`` is a ``from_pretrained`` option, so it has to travel in
    # ``model_kwargs``. Given as a bare keyword it is handed to the pipeline
    # itself instead of the loader and never selects the download location.
    return pipeline(task, model=model_name, model_kwargs={"cache_dir": cache_dir})
# One summarization pipeline per supported language key, loaded eagerly in
# the order en -> ru -> kz when the module is imported.
summarizers = {
    language: load_model("summarization", checkpoint)
    for language, checkpoint in (
        ("en", "facebook/bart-large-cnn"),
        ("ru", "IlyaGusev/mbart_ru_sum_gazeta"),
        ("kz", "csebuetnlp/mT5_multilingual_XLSum"),
    )
}
# === 📄 Extract text from an uploaded file ===
def read_file(file):
    """Return the text content of an uploaded .pdf, .docx or plain-text file.

    Args:
        file: A filesystem path (``str`` or ``os.PathLike``) or an object that
            exposes the path through a ``.name`` attribute. A falsy value
            means that nothing was uploaded.

    Returns:
        The extracted text with surrounding whitespace stripped, ``""`` when
        nothing was uploaded, or a message starting with "⚠️" when the file
        could not be read.
    """
    if not file:
        return ""
    # Resolve a real path once. Path-like objects must be handled before the
    # ``.name`` fallback because ``pathlib.Path.name`` is only the basename.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name
    lowered = path.lower()
    try:
        if lowered.endswith(".pdf"):
            with pdfplumber.open(path) as pdf:
                page_texts = (page.extract_text() for page in pdf.pages)
                # Pages without extractable text yield a falsy value; skip them.
                text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)
        elif lowered.endswith(".docx"):
            text = docx2txt.process(path) or ""
        else:
            # Always read from disk instead of calling ``read()`` on the
            # upload object: that object may be a plain path or a handle that
            # is closed or already consumed.
            with open(path, "rb") as handle:
                text = handle.read().decode("utf-8", errors="ignore")
    except Exception as e:
        # Callers recognise failures by the leading "⚠️" marker.
        return f"⚠️ Ошибка при чтении файла: {e}"
    return text.strip()
# === 🌐 Language detection ===
def detect_language(text):
    """Map ``text`` to one of the summarizer keys: ``"ru"``, ``"kz"`` or ``"en"``.

    Kazakh is recognised by its alphabet before the statistical detector is
    consulted; anything that is neither Kazakh nor Russian falls back to
    ``"en"``, as does empty input or a detector failure.

    Args:
        text: The text to classify; may be empty or ``None``.

    Returns:
        ``"kz"``, ``"ru"`` or ``"en"``.
    """
    if not text:
        return "en"
    # Letters of the Kazakh Cyrillic alphabet that Russian does not use.
    # The check runs first (and case-insensitively) because a detector answer
    # of "ru" would otherwise make it unreachable.
    # NOTE(review): langdetect appears to ship no Kazakh profile, so the "kk"
    # branch below is expected to be rare - confirm against the installed
    # version.
    kazakh_letters = frozenset("әіңғүұқөһ")
    if not kazakh_letters.isdisjoint(text.lower()):
        return "kz"
    try:
        lang = detect(text)
    except Exception:
        # Best effort: the detector raises on text without usable features.
        return "en"
    if lang.startswith("ru"):
        return "ru"
    if lang.startswith("kk"):
        return "kz"
    return "en"
# === 🧠 Text summarization ===
def summarize_text(text, chunk_size=2500, overlap=200):
    """Summarize ``text`` chunk by chunk with the model for its language.

    Args:
        text: Source text. Fewer than 50 characters (or a falsy value) is
            rejected as insufficient.
        chunk_size: Maximum number of characters sent to the model per call.
        overlap: Number of characters shared by consecutive chunks.

    Returns:
        A 6-tuple ``(summary, language label, model label, source length,
        summary length, compression)``, where lengths are character counts
        and compression is a percentage string such as ``"87.5%"``. For
        insufficient input the tuple carries a warning message, "❌"
        placeholders and zero lengths.
    """
    if not text or len(text) < 50:
        return "⚠️ Недостаточно текста для анализа.", "❌", "❌", 0, 0, "❌"
    lang = detect_language(text)
    model = summarizers.get(lang, summarizers["en"])
    flags = {"ru": "🇷🇺 Русский", "kz": "🇰🇿 Қазақ тілі", "en": "🇬🇧 English"}
    lang_label = flags.get(lang, "🌍 Unknown")
    model_label = getattr(model.model, "name_or_path", "Custom")
    # Guard against a non-positive step when overlap >= chunk_size.
    step = max(1, chunk_size - overlap)
    summaries = []
    for start in range(0, len(text), step):
        chunk = text[start:start + chunk_size]
        try:
            # truncation=True lets the tokenizer cut a chunk that is still
            # longer than the model's input limit instead of failing on it.
            result = model(
                chunk,
                max_length=180,
                min_length=40,
                do_sample=False,
                truncation=True,
            )
            summaries.append(result[0]["summary_text"])
        except Exception as e:
            # Keep going: one failing chunk must not discard the others.
            summaries.append(f"[Ошибка в части {len(summaries)+1}: {e}]")
        # This window already reached the end of the text. Any further window
        # would only repeat part of its tail, so stop here.
        if start + chunk_size >= len(text):
            break
    summary = "\n\n".join(summaries).strip()
    src_len = len(text)
    sum_len = len(summary)
    compression = round(100 * (1 - sum_len / src_len), 1) if src_len > 0 else 0
    return summary, lang_label, model_label, src_len, sum_len, f"{compression}%"
# === 💾 Saving the summary to files ===
def save_summary_as_txt(summary_text):
    """Write ``summary_text`` to ``summary.txt`` as UTF-8 and return that path."""
    destination = "summary.txt"
    with open(destination, mode="w", encoding="utf-8") as out:
        out.write(summary_text)
    return destination
def save_summary_as_docx(summary_text):
    """Save the summary as ``summary.docx`` and return that path.

    The document consists of a level-1 heading followed by one paragraph
    holding ``summary_text``.
    """
    out_path = "summary.docx"
    document = Document()
    document.add_heading("Резюме документа", level=1)
    document.add_paragraph(summary_text)
    document.save(out_path)
    return out_path
def save_summary_as_pdf(summary_text):
    """Render the summary into ``summary.pdf`` and return that path.

    The TrueType font at ``FONT_PATH`` is registered under the family name
    "DejaVu" and used at 12 pt for the whole text.
    """
    out_path = "summary.pdf"
    document = FPDF()
    document.add_page()
    # Register the font file before selecting it for output.
    document.add_font('DejaVu', '', FONT_PATH, uni=True)
    document.set_font('DejaVu', '', 12)
    # Width 0 is passed as the cell width, 10 as the line height.
    document.multi_cell(0, 10, summary_text)
    document.output(out_path)
    return out_path
# === 🚀 Main processing entry point ===
def summarize_file(file, progress=gr.Progress(track_tqdm=True)):
    """Read an uploaded file, summarize it and export the summary.

    Args:
        file: The upload as delivered by the ``gr.File`` input.
        progress: Progress tracker. Gradio injects it through this default
            argument; ``gr.Progress`` is not a context manager, so it must
            not be used in a ``with`` statement.

    Returns:
        A 9-tuple: summary text, language label, model label, source length,
        summary length, compression string, and the paths of the TXT, DOCX
        and PDF exports. When the file cannot be read, the first element is
        the error message and the three paths are ``None``.
    """
    text = read_file(file)
    # read_file signals failure with a message that starts with "⚠️".
    if text.startswith("⚠️"):
        return text, "❌", "❌", 0, 0, "❌", None, None, None
    progress(0, desc="🧠 Анализ текста...")
    summary, lang_label, model_label, src_len, sum_len, compression = summarize_text(text)
    progress(1, desc="✅ Готово!")
    txt_path = save_summary_as_txt(summary)
    docx_path = save_summary_as_docx(summary)
    pdf_path = save_summary_as_pdf(summary)
    return (
        summary,
        lang_label,
        model_label,
        src_len,
        sum_len,
        compression,
        txt_path,
        docx_path,
        pdf_path,
    )
# === 🎨 Gradio interface ===
# Default theme recoloured with purple/violet hues, a light lavender page
# background and gradient fills for the primary button (normal and hover).
custom_theme = gt.Default(primary_hue="purple", secondary_hue="violet").set(
    body_background_fill="#f8f6ff",
    button_primary_background_fill="linear-gradient(90deg, #7e3ff2, #c084fc)",
    button_primary_background_fill_hover="linear-gradient(90deg, #6b21a8, #9333ea)",
)
# Single-input interface: one uploaded document in, nine outputs back. The
# output order must match the tuple returned by summarize_file.
demo = gr.Interface(
    fn=summarize_file,
    inputs=gr.File(label="📂 Загрузите документ (.pdf, .docx, .txt)"),
    outputs=[
        # Summary and metadata
        gr.Textbox(label="🧾 Краткое резюме"),
        gr.Textbox(label="🌍 Определённый язык"),
        gr.Textbox(label="🧠 Используемая модель"),
        # Length statistics
        gr.Number(label="📄 Длина исходного текста"),
        gr.Number(label="📝 Длина резюме"),
        gr.Textbox(label="📉 Степень сокращения"),
        # Downloadable exports
        gr.File(label="📄 Скачать TXT"),
        gr.File(label="📘 Скачать DOCX"),
        gr.File(label="📕 Скачать PDF"),
    ],
    title="🧠 Eroha Summarizer PRO (Автономная версия)",
    description="🚀 Определяет язык (🇷🇺 / 🇰🇿 / 🇬🇧), создаёт краткое резюме и сохраняет в TXT, DOCX, PDF с поддержкой кириллицы.",
    theme=custom_theme,
    allow_flagging="never",
)
# Listen on all interfaces on port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)
|