Spaces:
Running
Running
File size: 5,205 Bytes
2b1b52c ad5f036 71b5660 9c815d5 9e85362 50ad747 4d0947f d93b7e1 11a33db 5a1bb5b 2b1b52c cbb7d22 2b1b52c 5a1bb5b 03170fb 5a1bb5b 2b1b52c 5a1bb5b 4d0947f 71b5660 5a1bb5b 9c815d5 0d0b783 928a3cd 5a1bb5b 50ad747 9e85362 5a1bb5b 9e85362 5a1bb5b 9e85362 5a1bb5b 9e85362 9c815d5 5a1bb5b 4d0947f 5a1bb5b 4d0947f 5a1bb5b 9c815d5 9e85362 5a1bb5b 0d0b783 4d0947f 5a1bb5b 0d0b783 5a1bb5b a669d98 4d0947f 928a3cd 0d0b783 50ad747 5a1bb5b 9c815d5 5a1bb5b 9c815d5 0d0b783 5a1bb5b 9c815d5 d93b7e1 0d0b783 5a1bb5b 928a3cd 5a1bb5b a669d98 5a1bb5b 03170fb 206e908 5a1bb5b 03170fb 5a1bb5b 03170fb 206e908 5a1bb5b 206e908 ad5f036 9c815d5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
import os
import gradio as gr
from transformers import pipeline
import pdfplumber
import docx2txt
from docx import Document
from fpdf import FPDF
from langdetect import detect
import urllib.request
# === Настройки ===
# Directory that the model loader is pointed at as its download cache.
os.makedirs("/app/models", exist_ok=True)

# Unicode TrueType font registered with FPDF when the PDF export is rendered.
FONT_PATH = "DejaVuSans.ttf"
if not os.path.exists(FONT_PATH):
    # Download under a temporary name and rename only after the transfer
    # finished: an interrupted download must not leave a truncated file that
    # would satisfy the existence check on the next start.
    _font_download_path = FONT_PATH + ".part"
    urllib.request.urlretrieve(
        "https://github.com/dejavu-fonts/dejavu-fonts/raw/master/ttf/DejaVuSans.ttf",
        _font_download_path,
    )
    os.replace(_font_download_path, FONT_PATH)
# === Модели ===
def load_model(task, name):
    """Create a Hugging Face pipeline for *task* backed by checkpoint *name*.

    Args:
        task: Task identifier handed to ``pipeline`` (the file uses
            "summarization").
        name: Checkpoint name or path of the model to load.

    Returns:
        The object returned by ``transformers.pipeline``.
    """
    print(f"🔹 Загружается модель: {name}")
    # The cache directory is a loading option, so it is passed via
    # model_kwargs, which pipeline() forwards to from_pretrained(). Given as a
    # bare keyword it is handed to the pipeline object itself, where a
    # summarization pipeline treats unknown keywords as generation arguments.
    return pipeline(task, model=name, model_kwargs={"cache_dir": "/app/models"})
# One summarization pipeline per supported language code. Every checkpoint is
# loaded eagerly, in the order listed, when the module is imported.
models = {
    lang_code: load_model("summarization", checkpoint)
    for lang_code, checkpoint in (
        ("en", "facebook/bart-large-cnn"),
        ("ru", "IlyaGusev/mbart_ru_sum_gazeta"),
        ("kz", "csebuetnlp/mT5_multilingual_XLSum"),
    )
}
# === Чтение файлов ===
def read_file(file):
    """Extract plain text from an uploaded .pdf, .docx or text document.

    Args:
        file: The upload as delivered by the UI: either an object exposing
            the on-disk location as ``.name`` or the path itself as a string.
            A falsy value means nothing was uploaded.

    Returns:
        The extracted text with surrounding whitespace removed, ``""`` when
        no file was supplied, or a message starting with "⚠️" if the document
        could not be read (callers use that prefix to detect the failure).
    """
    if not file:
        return ""
    try:
        # Always read from the path on disk instead of calling file.read():
        # the uploaded value is not guaranteed to be an open binary handle
        # (it may be a path-like string that has no read() method).
        source_path = getattr(file, "name", file)
        lowered = source_path.lower()
        if lowered.endswith(".pdf"):
            with pdfplumber.open(source_path) as pdf:
                page_texts = (page.extract_text() for page in pdf.pages)
                # Pages without extractable text yield nothing and are skipped.
                text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)
        elif lowered.endswith(".docx"):
            text = docx2txt.process(source_path) or ""
        else:
            # Anything else is treated as UTF-8 text; undecodable bytes are dropped.
            with open(source_path, "rb") as raw:
                text = raw.read().decode("utf-8", errors="ignore")
    except Exception as e:
        return f"⚠️ Ошибка при чтении: {e}"
    return text.strip()
# === Определение языка ===
def detect_language(text):
    """Map *text* to one of the language codes the app has a model for.

    Args:
        text: The document text to classify.

    Returns:
        "kz" when the text contains Kazakh-specific Cyrillic letters (or the
        detector reports a "kk" code), "ru" when the detector reports
        Russian, and "en" for everything else, including empty input and
        detector failures.
    """
    if not text:
        return "en"
    # Look for Kazakh-only letters before consulting the detector: the check
    # used to run only when the detector did not answer "ru", so Kazakh text
    # classified as Russian never reached it. Lower-casing makes the check
    # cover upper-case text as well.
    # NOTE(review): langdetect's documented language list appears to have no
    # Kazakh entry, which is why this letter test is the primary signal.
    kazakh_letters = set("әіңғүұқөһ")
    if not kazakh_letters.isdisjoint(text.lower()):
        return "kz"
    try:
        lang = detect(text)
    except Exception:
        # Best effort: undetectable input falls back to the English model.
        return "en"
    if lang.startswith("ru"):
        return "ru"
    if lang.startswith("kk"):
        return "kz"
    return "en"
# === Суммаризация ===
def _split_with_overlap(text, chunk_size, overlap):
    """Yield consecutive windows of *text* that share *overlap* characters."""
    stride = chunk_size - overlap
    start = 0
    while start < len(text):
        yield text[start:start + chunk_size]
        if start + chunk_size >= len(text):
            # This window already reached the end of the text; one more step
            # would only re-emit a tail that lies entirely inside it.
            break
        start += stride


def summarize_text(text):
    """Summarize *text* chunk by chunk with the model for its language.

    Args:
        text: The document text. Fewer than 50 characters (or a falsy value)
            is rejected as insufficient.

    Returns:
        A 6-tuple ``(summary, language label, model label, source length,
        summary length, compression)``. Lengths are character counts and
        compression is a percentage string. For insufficient input the tuple
        is the warning text followed by "❌"/0 placeholders.
    """
    if not text or len(text) < 50:
        return "⚠️ Недостаточно текста", "❌", "❌", 0, 0, "❌"
    lang = detect_language(text)
    model = models.get(lang, models["en"])
    flags = {"ru": "🇷🇺 Русский", "kz": "🇰🇿 Қазақ тілі", "en": "🇬🇧 English"}
    lang_label = flags.get(lang, "🌍 Unknown")
    model_label = getattr(model.model, "name_or_path", "Custom")

    summaries = []
    for chunk in _split_with_overlap(text, chunk_size=2500, overlap=200):
        try:
            res = model(chunk, max_length=180, min_length=40, do_sample=False)
            summaries.append(res[0]['summary_text'])
        except Exception as e:
            # A failing chunk is reported in place so the remaining chunks
            # still contribute to the result.
            summaries.append(f"[Ошибка в части {len(summaries)+1}: {e}]")
    summary = "\n\n".join(summaries).strip()

    src_len = len(text)
    sum_len = len(summary)
    # src_len is at least 50 here, so the division is always defined.
    comp = f"{round(100*(1 - sum_len/src_len),1)}%"
    return summary, lang_label, model_label, src_len, sum_len, comp
# === Сохранение ===
def save_summary(summary):
    """Write *summary* to TXT, DOCX and PDF files in the working directory.

    Args:
        summary: The text to export.

    Returns:
        The three file names as a tuple ``(txt, docx, pdf)``.
    """
    txt_name = "summary.txt"
    docx_name = "summary.docx"
    pdf_name = "summary.pdf"

    # Plain text export.
    with open(txt_name, "w", encoding="utf-8") as out:
        out.write(summary)

    # Word export: a title heading followed by the summary as one paragraph.
    word_doc = Document()
    word_doc.add_heading("Резюме", 0)
    word_doc.add_paragraph(summary)
    word_doc.save(docx_name)

    # PDF export, rendered with the Unicode font registered from FONT_PATH.
    pdf_doc = FPDF()
    pdf_doc.add_page()
    pdf_doc.add_font('DejaVu', '', FONT_PATH, uni=True)
    pdf_doc.set_font('DejaVu', '', 12)
    pdf_doc.multi_cell(0, 10, summary)
    pdf_doc.output(pdf_name)

    return txt_name, docx_name, pdf_name
# === Основная функция ===
def summarize_file(file, progress=gr.Progress()):
    """Read an uploaded document, summarize it and export the summary.

    Args:
        file: The uploaded document as delivered by the file input.
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        A 9-tuple: summary text, language label, model label, source length,
        summary length, compression string, and the TXT, DOCX and PDF paths.
        The three paths are ``None`` whenever no summary was produced
        (unreadable file or too little text).
    """
    text = read_file(file)
    if text.startswith("⚠️"):
        return text, "❌", "❌", 0, 0, "❌", None, None, None
    progress(0.2, "🧠 Анализ текста...")
    summary, lang, model, src, summ, comp = summarize_text(text)
    if summary.startswith("⚠️"):
        # The summarizer declined (same "⚠️" prefix convention as a read
        # error): show the message, but do not export it as a document.
        return summary, lang, model, src, summ, comp, None, None, None
    progress(0.8, "📄 Сохранение файлов...")
    txt, docx, pdf = save_summary(summary)
    progress(1, "✅ Готово!")
    return summary, lang, model, src, summ, comp, txt, docx, pdf
# === Интерфейс ===
# Page layout: components appear in the order they are created here.
with gr.Blocks() as demo:
    gr.Markdown("<h2 style='text-align:center'>🧠 Eroha Summarizer PRO</h2>")

    # Input controls.
    file_input = gr.File(label="📂 Загрузите документ (.pdf, .docx, .txt)")
    run_btn = gr.Button("🔍 Сгенерировать резюме", variant="primary")

    # Text results and statistics.
    summary_out = gr.Textbox(label="🧾 Краткое резюме", lines=10)
    lang_out = gr.Textbox(label="🌍 Определённый язык")
    model_out = gr.Textbox(label="🧠 Используемая модель")
    src_len = gr.Number(label="📄 Длина исходного текста")
    sum_len = gr.Number(label="📝 Длина резюме")
    comp = gr.Textbox(label="📉 Степень сокращения")

    # Downloadable exports.
    txt = gr.File(label="📄 TXT")
    docx = gr.File(label="📘 DOCX")
    pdf = gr.File(label="📕 PDF")

    # The order must match the tuple returned by summarize_file.
    _result_components = [
        summary_out,
        lang_out,
        model_out,
        src_len,
        sum_len,
        comp,
        txt,
        docx,
        pdf,
    ]
    run_btn.click(summarize_file, inputs=file_input, outputs=_result_components)

# Listen on all interfaces, port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)
|