# eroha-agentapi / app.py
# Yermek68's picture
# Update app.py
# 5a1bb5b verified
# raw
# history blame
# 5.21 kB
import os
import gradio as gr
from transformers import pipeline
import pdfplumber
import docx2txt
from docx import Document
from fpdf import FPDF
from langdetect import detect
import urllib.request
# === Настройки ===
# Ensure the directory used as the model cache exists.
os.makedirs("/app/models", exist_ok=True)

# TrueType font registered for the PDF export. It is looked up relative to the
# working directory and downloaded only when it is not there yet.
FONT_PATH = "DejaVuSans.ttf"
_FONT_URL = "https://github.com/dejavu-fonts/dejavu-fonts/raw/master/ttf/DejaVuSans.ttf"
if not os.path.exists(FONT_PATH):
    urllib.request.urlretrieve(url=_FONT_URL, filename=FONT_PATH)
# === Модели ===
def load_model(task, name, cache_dir="/app/models"):
    """Create a ``transformers`` pipeline for ``task`` backed by model ``name``.

    Args:
        task: Pipeline task identifier, e.g. ``"summarization"``.
        name: Model id or path, passed to ``pipeline`` as ``model``.
        cache_dir: Directory in which downloaded model files are cached.

    Returns:
        The object returned by ``transformers.pipeline``.
    """
    print(f"🔹 Загружается модель: {name}")
    # The cache location is a loading option, so it has to travel inside
    # ``model_kwargs``. A bare ``cache_dir=`` keyword is forwarded to the
    # pipeline object itself, where it is not a download setting and is
    # treated as a call/generation argument instead.
    return pipeline(task, model=name, model_kwargs={"cache_dir": cache_dir})
# Summarization checkpoint for each supported language code. Every pipeline is
# loaded eagerly at import time, in the order listed here.
_SUMMARIZER_CHECKPOINTS = (
    ("en", "facebook/bart-large-cnn"),
    ("ru", "IlyaGusev/mbart_ru_sum_gazeta"),
    ("kz", "csebuetnlp/mT5_multilingual_XLSum"),
)
models = {
    code: load_model("summarization", checkpoint)
    for code, checkpoint in _SUMMARIZER_CHECKPOINTS
}
# === Чтение файлов ===
def read_file(file):
    """Extract plain text from an uploaded ``.pdf``, ``.docx`` or text file.

    Args:
        file: Either a filesystem path (``str`` / ``os.PathLike``) or an object
            exposing the path as ``.name``. NOTE(review): which of the two the
            UI delivers depends on the Gradio version/configuration - both are
            accepted here.

    Returns:
        The extracted text with surrounding whitespace stripped, ``""`` when
        nothing was uploaded, or a message starting with ``"⚠️"`` when the file
        could not be read (callers test for that prefix).
    """
    if not file:
        return ""

    if isinstance(file, (str, os.PathLike)):
        location = file
    else:
        location = getattr(file, "name", None)

    try:
        # Everything that can fail, including resolving the path, stays inside
        # the ``try`` so the caller always gets the "⚠️" message, not a crash.
        path = os.fspath(location)
        lowered = path.lower()
        if lowered.endswith(".pdf"):
            with pdfplumber.open(path) as pdf:
                page_texts = [page.extract_text() for page in pdf.pages]
            # Pages without extractable text yield nothing and are skipped.
            text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)
        elif lowered.endswith(".docx"):
            text = docx2txt.process(path) or ""
        else:
            # Read by path in binary mode: this does not depend on how (or
            # whether) the upload object was opened, and always yields bytes.
            with open(path, "rb") as handle:
                text = handle.read().decode("utf-8", errors="ignore")
    except Exception as e:
        return f"⚠️ Ошибка при чтении: {e}"
    return text.strip()
# === Определение языка ===
# Letters used by this module as the marker for Kazakh text.
_KAZAKH_MARKER_LETTERS = frozenset("әіңғүұқөһ")


def detect_language(text):
    """Map ``text`` to one of the supported model keys: "ru", "kz" or "en".

    The Kazakh marker letters are checked first and case-insensitively, so the
    result no longer depends on the statistical detector: text that the
    detector labels as Russian would otherwise never reach the letter check.
    Trade-off: a text containing even one of these letters is treated as
    Kazakh.

    Args:
        text: The text to classify.

    Returns:
        "kz" if a marker letter is present or the detector reports "kk",
        "ru" if the detector reports Russian, otherwise "en" (also used when
        the text is empty or detection fails).
    """
    if not text:
        return "en"
    if not _KAZAKH_MARKER_LETTERS.isdisjoint(text.lower()):
        return "kz"
    try:
        lang = detect(text)
    except Exception:
        # Best effort: detection failures fall back to the English model.
        return "en"
    if lang.startswith("ru"):
        return "ru"
    if lang.startswith("kk"):
        return "kz"
    return "en"
# === Суммаризация ===
def _chunk_spans(total_len, chunk_size, overlap):
    """Yield ``(start, end)`` character spans covering ``total_len`` with overlap."""
    step = chunk_size - overlap
    start = 0
    while start < total_len:
        end = min(start + chunk_size, total_len)
        yield start, end
        if end >= total_len:
            # The text is fully covered. Another window would only repeat the
            # overlap tail of this one and add a redundant mini-summary.
            break
        start += step


def summarize_text(text, chunk_size=2500, overlap=200):
    """Summarize ``text`` chunk by chunk with the model for its language.

    Args:
        text: Source text. Fewer than 50 characters is treated as too little.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared by consecutive chunks.

    Returns:
        A 6-tuple ``(summary, language label, model label, source length,
        summary length, compression)``. Lengths are in characters and
        compression is a percentage string. For too-short input the summary is
        a message starting with "⚠️" and the other fields are placeholders.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not in
            ``[0, chunk_size)``.
    """
    if chunk_size <= 0 or not 0 <= overlap < chunk_size:
        raise ValueError(
            "chunk_size must be positive and overlap must satisfy "
            "0 <= overlap < chunk_size"
        )
    if not text or len(text) < 50:
        return "⚠️ Недостаточно текста", "❌", "❌", 0, 0, "❌"

    lang = detect_language(text)
    model = models.get(lang, models["en"])
    flags = {"ru": "🇷🇺 Русский", "kz": "🇰🇿 Қазақ тілі", "en": "🇬🇧 English"}
    lang_label = flags.get(lang, "🌍 Unknown")
    model_label = getattr(model.model, "name_or_path", "Custom")

    summaries = []
    spans = _chunk_spans(len(text), chunk_size, overlap)
    for part_no, (start, end) in enumerate(spans, start=1):
        try:
            # Chunks are measured in characters, not tokens, so a chunk may
            # still exceed the model's input limit; let the tokenizer truncate.
            res = model(
                text[start:end],
                max_length=180,
                min_length=40,
                do_sample=False,
                truncation=True,
            )
            summaries.append(res[0]["summary_text"])
        except Exception as e:
            # A failing chunk is reported inline so the other parts survive.
            summaries.append(f"[Ошибка в части {part_no}: {e}]")

    summary = "\n\n".join(summaries).strip()
    src_len = len(text)
    sum_len = len(summary)
    comp = f"{round(100*(1 - sum_len/src_len),1)}%" if src_len else "0%"
    return summary, lang_label, model_label, src_len, sum_len, comp
# === Сохранение ===
def save_summary(summary):
    """Write ``summary`` to TXT, DOCX and PDF files and return their paths.

    Each call writes into its own freshly created temporary directory, so
    simultaneous requests cannot overwrite each other's exports (fixed names
    in the working directory would be shared by all of them). The directory is
    not removed here; cleaning it up is left to the caller or the OS.

    Args:
        summary: The text to export.

    Returns:
        A 3-tuple of paths ``(txt, docx, pdf)``.
    """
    import tempfile  # local import: only this function needs it

    out_dir = tempfile.mkdtemp(prefix="summary_")
    txt = os.path.join(out_dir, "summary.txt")
    docx = os.path.join(out_dir, "summary.docx")
    pdf = os.path.join(out_dir, "summary.pdf")

    with open(txt, "w", encoding="utf-8") as f:
        f.write(summary)

    d = Document()
    d.add_heading("Резюме", 0)
    d.add_paragraph(summary)
    d.save(docx)

    pdf_doc = FPDF()
    pdf_doc.add_page()
    # Register the downloaded TrueType font before selecting it.
    pdf_doc.add_font('DejaVu', '', FONT_PATH, uni=True)
    pdf_doc.set_font('DejaVu', '', 12)
    pdf_doc.multi_cell(0, 10, summary)
    pdf_doc.output(pdf)
    return txt, docx, pdf
# === Основная функция ===
def summarize_file(file, progress=gr.Progress()):
    """Read an uploaded file, summarize it and export the summary.

    Args:
        file: The upload as delivered by the file input component.
        progress: Progress reporter; called with a fraction and a status text.

    Returns:
        A 9-tuple matching the UI outputs: summary text, language label, model
        label, source length, summary length, compression, then the TXT, DOCX
        and PDF paths. The three paths are ``None`` whenever the first element
        is a "⚠️" warning rather than a real summary.
    """
    text = read_file(file)
    if text.startswith("⚠️"):
        return text, "❌", "❌", 0, 0, "❌", None, None, None

    progress(0.2, "🧠 Анализ текста...")
    summary, lang, model, src, summ, comp = summarize_text(text)
    if summary.startswith("⚠️"):
        # Nothing was summarized (e.g. too little text): show the warning but
        # do not export it as if it were a summary document.
        return summary, lang, model, src, summ, comp, None, None, None

    progress(0.8, "📄 Сохранение файлов...")
    txt, docx, pdf = save_summary(summary)
    progress(1, "✅ Готово!")
    return summary, lang, model, src, summ, comp, txt, docx, pdf
# === Интерфейс ===
# Page layout: an upload field and a trigger button, followed by the result
# widgets in the same order in which summarize_file returns its values.
with gr.Blocks() as demo:
    gr.Markdown("<h2 style='text-align:center'>🧠 Eroha Summarizer PRO</h2>")

    file_input = gr.File(label="📂 Загрузите документ (.pdf, .docx, .txt)")
    run_btn = gr.Button("🔍 Сгенерировать резюме", variant="primary")

    # Text results and statistics.
    summary_out = gr.Textbox(label="🧾 Краткое резюме", lines=10)
    lang_out = gr.Textbox(label="🌍 Определённый язык")
    model_out = gr.Textbox(label="🧠 Используемая модель")
    src_len = gr.Number(label="📄 Длина исходного текста")
    sum_len = gr.Number(label="📝 Длина резюме")
    comp = gr.Textbox(label="📉 Степень сокращения")

    # Downloadable exports.
    txt = gr.File(label="📄 TXT")
    docx = gr.File(label="📘 DOCX")
    pdf = gr.File(label="📕 PDF")

    _result_widgets = [
        summary_out,
        lang_out,
        model_out,
        src_len,
        sum_len,
        comp,
        txt,
        docx,
        pdf,
    ]
    run_btn.click(fn=summarize_file, inputs=file_input, outputs=_result_widgets)

# Listen on all interfaces on port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)