File size: 5,205 Bytes
2b1b52c
ad5f036
71b5660
 
9c815d5
9e85362
50ad747
4d0947f
d93b7e1
11a33db
5a1bb5b
2b1b52c
 
cbb7d22
2b1b52c
 
 
 
 
 
5a1bb5b
 
 
 
03170fb
5a1bb5b
2b1b52c
 
5a1bb5b
4d0947f
71b5660
5a1bb5b
9c815d5
0d0b783
928a3cd
5a1bb5b
50ad747
9e85362
5a1bb5b
9e85362
 
5a1bb5b
 
 
 
9e85362
 
 
 
5a1bb5b
9e85362
9c815d5
5a1bb5b
4d0947f
 
 
 
 
5a1bb5b
4d0947f
 
 
 
 
 
5a1bb5b
9c815d5
9e85362
5a1bb5b
0d0b783
4d0947f
5a1bb5b
0d0b783
 
5a1bb5b
a669d98
4d0947f
 
928a3cd
0d0b783
50ad747
5a1bb5b
9c815d5
5a1bb5b
 
9c815d5
0d0b783
 
 
 
 
5a1bb5b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9c815d5
d93b7e1
0d0b783
 
5a1bb5b
 
 
 
 
 
928a3cd
5a1bb5b
a669d98
5a1bb5b
03170fb
206e908
5a1bb5b
 
 
03170fb
 
5a1bb5b
 
 
 
03170fb
206e908
 
5a1bb5b
 
206e908
ad5f036
9c815d5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import os
import gradio as gr
from transformers import pipeline
import pdfplumber
import docx2txt
from docx import Document
from fpdf import FPDF
from langdetect import detect
import urllib.request

# === Настройки ===
# Local file name of the TrueType font that save_summary() registers for PDF output.
FONT_PATH = "DejaVuSans.ttf"

# Directory used as the model cache by load_model(); created up front.
os.makedirs("/app/models", exist_ok=True)

# Download the font only when no local copy exists yet.
if not os.path.exists(FONT_PATH):
    urllib.request.urlretrieve(
        url="https://github.com/dejavu-fonts/dejavu-fonts/raw/master/ttf/DejaVuSans.ttf",
        filename=FONT_PATH,
    )

# === Модели ===
def load_model(task, name):
    """Build a transformers pipeline for *task* backed by checkpoint *name*.

    Args:
        task: Pipeline task identifier, e.g. "summarization".
        name: Model checkpoint identifier passed to ``pipeline`` as ``model``.

    Returns:
        The pipeline object created by ``transformers.pipeline``.
    """
    print(f"🔹 Загружается модель: {name}")
    # The cache location is a loading option, so it goes through
    # ``model_kwargs``. Extra keyword arguments given directly to pipeline()
    # are handed to the pipeline object itself, not to from_pretrained().
    return pipeline(task, model=name, model_kwargs={"cache_dir": "/app/models"})

# Summarization pipelines keyed by the language codes that detect_language()
# returns. All of them are loaded eagerly, in the order listed.
models = {
    lang_key: load_model("summarization", checkpoint)
    for lang_key, checkpoint in (
        ("en", "facebook/bart-large-cnn"),
        ("ru", "IlyaGusev/mbart_ru_sum_gazeta"),
        ("kz", "csebuetnlp/mT5_multilingual_XLSum"),
    )
}

# === Чтение файлов ===
def read_file(file):
    """Extract plain text from an uploaded document.

    Args:
        file: The upload to read. Either an object exposing its on-disk
            location as ``.name`` or a plain path string. A falsy value
            means nothing was uploaded.

    Returns:
        The extracted text with surrounding whitespace stripped, ``""`` when
        no file was given, or a message starting with "⚠️" when reading
        failed (summarize_file() tests for that prefix).
    """
    if not file:
        return ""
    # Always work from the path on disk. The PDF and DOCX branches already do,
    # and calling ``file.read()`` breaks when the upload is a path-like string
    # or a handle that is closed or already consumed.
    source_path = str(getattr(file, "name", file))
    # Lower-cased copy is used only for the extension checks.
    lowered_path = source_path.lower()
    try:
        if lowered_path.endswith(".pdf"):
            with pdfplumber.open(source_path) as pdf:
                page_texts = [page.extract_text() for page in pdf.pages]
            # Pages for which extract_text() returned nothing are skipped.
            text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)
        elif lowered_path.endswith(".docx"):
            text = docx2txt.process(source_path)
        else:
            # Anything else is treated as UTF-8 text; undecodable bytes are dropped.
            with open(source_path, "rb") as handle:
                text = handle.read().decode("utf-8", errors="ignore")
    except Exception as e:
        # Errors are reported in-band so the UI can show them to the user.
        return f"⚠️ Ошибка при чтении: {e}"
    return text.strip()

# === Определение языка ===
def detect_language(text):
    """Map *text* to one of the language keys used by ``models``.

    Args:
        text: Text whose language should be guessed.

    Returns:
        "ru" when the detector reports a code starting with "ru"; "kz" when
        it reports a code starting with "kk" or the text contains one of the
        Kazakh-specific Cyrillic letters; "en" otherwise, including when
        detection itself fails.
    """
    try:
        lang = detect(text)
    except Exception:
        # Best effort: a failed detection falls back to the default language.
        # Unlike a bare ``except:``, this no longer swallows
        # KeyboardInterrupt or SystemExit.
        return "en"

    # NOTE(review): a "ru" result takes precedence over the Kazakh-letter
    # check below, so Kazakh text that the detector labels as Russian is
    # routed to the Russian model. Confirm this ordering is intended.
    if lang.startswith("ru"):
        return "ru"

    # Compare against a lower-cased copy so upper-case Kazakh letters
    # (e.g. in headings) are recognised too.
    lowered = text.lower()
    if lang.startswith("kk") or any(letter in lowered for letter in "әіңғүұқөһ"):
        return "kz"
    return "en"

# === Суммаризация ===
def summarize_text(text):
    """Summarize *text* window by window with the model for its language.

    The text is cut into overlapping character windows. Each window is
    summarized on its own and the partial summaries are joined with blank
    lines. A window whose summarization raises is replaced by an inline
    error marker so the remaining windows are still processed.

    Args:
        text: Source text. Falsy values and texts shorter than 50 characters
            are rejected.

    Returns:
        A 6-tuple ``(summary, language_label, model_label, source_length,
        summary_length, compression)``. Both lengths are character counts and
        ``compression`` is a percentage string. For rejected input the tuple
        is ``("⚠️ Недостаточно текста", "❌", "❌", 0, 0, "❌")``.
    """
    if not text or len(text) < 50:
        return "⚠️ Недостаточно текста", "❌", "❌", 0, 0, "❌"

    lang = detect_language(text)
    model = models.get(lang, models["en"])
    flags = {"ru": "🇷🇺 Русский", "kz": "🇰🇿 Қазақ тілі", "en": "🇬🇧 English"}
    lang_label = flags.get(lang, "🌍 Unknown")
    model_label = getattr(model.model, "name_or_path", "Custom")

    chunk_size = 2500
    overlap = 200
    # Consecutive windows start this far apart, so they share `overlap` characters.
    step = chunk_size - overlap
    src_len = len(text)
    summaries = []

    for start in range(0, src_len, step):
        chunk = text[start:start + chunk_size]
        try:
            res = model(chunk, max_length=180, min_length=40, do_sample=False)
            summaries.append(res[0]['summary_text'])
        except Exception as e:
            # Keep going: one failing window must not discard the others.
            summaries.append(f"[Ошибка в части {len(summaries)+1}: {e}]")
        if start + chunk_size >= src_len:
            # This window already reached the end of the text. The next one
            # would lie entirely inside its overlap and only repeat content.
            break

    summary = "\n\n".join(summaries).strip()
    sum_len = len(summary)
    comp = f"{round(100*(1 - sum_len/src_len),1)}%" if src_len else "0%"
    return summary, lang_label, model_label, src_len, sum_len, comp

# === Сохранение ===
def save_summary(summary):
    """Write *summary* to a text file, a Word document and a PDF.

    The files are created under fixed relative names ("summary.txt",
    "summary.docx", "summary.pdf"), so they land in the current working
    directory and replace any earlier output.

    Args:
        summary: Text to export.

    Returns:
        The three file names as a tuple, in the order (txt, docx, pdf).
    """
    txt_name = "summary.txt"
    docx_name = "summary.docx"
    pdf_name = "summary.pdf"

    # Plain text export.
    with open(txt_name, "w", encoding="utf-8") as text_file:
        text_file.write(summary)

    # Word export: a level-0 heading followed by the summary paragraph.
    word_doc = Document()
    word_doc.add_heading("Резюме", 0)
    word_doc.add_paragraph(summary)
    word_doc.save(docx_name)

    # PDF export using the TrueType font registered from FONT_PATH.
    pdf_writer = FPDF()
    pdf_writer.add_page()
    pdf_writer.add_font('DejaVu', '', FONT_PATH, uni=True)
    pdf_writer.set_font('DejaVu', '', 12)
    pdf_writer.multi_cell(0, 10, summary)
    pdf_writer.output(pdf_name)

    return txt_name, docx_name, pdf_name

# === Основная функция ===
def summarize_file(file, progress=gr.Progress()):
    """Run the whole pipeline for one upload: read, summarize, export.

    Args:
        file: Uploaded file as delivered by the file input component.
        progress: Gradio progress tracker, called with a fraction and a
            status message.

    Returns:
        A 9-tuple ``(summary, language_label, model_label, source_length,
        summary_length, compression, txt_path, docx_path, pdf_path)``. When
        reading fails or the text is rejected, the first element carries the
        "⚠️" message and the three paths are ``None``.
    """
    text = read_file(file)
    if text.startswith("⚠️"):
        # read_file() reports failures in-band with this prefix.
        return text, "❌", "❌", 0, 0, "❌", None, None, None

    progress(0.2, "🧠 Анализ текста...")
    summary, lang, model, src, summ, comp = summarize_text(text)
    if summary.startswith("⚠️"):
        # The text was rejected, so there is nothing worth exporting. Return
        # no files, matching the read-error branch above, instead of saving
        # the warning itself as TXT/DOCX/PDF.
        return summary, lang, model, src, summ, comp, None, None, None

    progress(0.8, "📄 Сохранение файлов...")
    txt, docx, pdf = save_summary(summary)
    progress(1, "✅ Готово!")
    return summary, lang, model, src, summ, comp, txt, docx, pdf

# === Интерфейс ===
# Build the UI: one file input, a trigger button, and nine result widgets.
with gr.Blocks() as demo:
    gr.Markdown("<h2 style='text-align:center'>🧠 Eroha Summarizer PRO</h2>")
    file_input = gr.File(label="📂 Загрузите документ (.pdf, .docx, .txt)")
    run_btn = gr.Button("🔍 Сгенерировать резюме", variant="primary")
    # Text and numeric results.
    summary_out = gr.Textbox(label="🧾 Краткое резюме", lines=10)
    lang_out = gr.Textbox(label="🌍 Определённый язык")
    model_out = gr.Textbox(label="🧠 Используемая модель")
    src_len = gr.Number(label="📄 Длина исходного текста")
    sum_len = gr.Number(label="📝 Длина резюме")
    comp = gr.Textbox(label="📉 Степень сокращения")
    # Download slots for the three exported files.
    txt = gr.File(label="📄 TXT")
    docx = gr.File(label="📘 DOCX")
    pdf = gr.File(label="📕 PDF")

    # The outputs list follows the order of the 9-tuple returned by summarize_file.
    run_btn.click(
        summarize_file,
        inputs=file_input,
        outputs=[summary_out, lang_out, model_out, src_len, sum_len, comp, txt, docx, pdf],
    )

# Start the server bound to 0.0.0.0 on port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)