Yermek68 committed on
Commit
4d0947f
·
verified ·
1 Parent(s): ec8859f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -28
app.py CHANGED
@@ -4,18 +4,20 @@ import pdfplumber
4
  import docx2txt
5
  from docx import Document
6
  from fpdf import FPDF
 
7
 
8
- # Загружаем модель
9
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
 
 
 
 
10
 
11
  def read_file(file):
12
- """Чтение PDF, DOCX и TXT"""
13
  if file is None:
14
  return ""
15
-
16
  filename = file.name.lower()
17
  text = ""
18
-
19
  try:
20
  if filename.endswith(".pdf"):
21
  with pdfplumber.open(file.name) as pdf:
@@ -29,76 +31,74 @@ def read_file(file):
29
  text = file.read().decode("utf-8", errors="ignore")
30
  except Exception as e:
31
  return f"Ошибка при чтении файла: {str(e)}"
32
-
33
  return text.strip()
34
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  def summarize_text(text):
36
- """Суммаризация с безопасным делением на части"""
37
  if not text or len(text) < 50:
38
  return "⚠️ Недостаточно текста для суммаризации."
39
-
40
- chunk_size = 2500 # безопасная длина
41
- overlap = 200 # перекрытие между кусками
 
42
  summaries = []
43
-
44
  for i in range(0, len(text), chunk_size - overlap):
45
  chunk = text[i:i + chunk_size]
46
  try:
47
- result = summarizer(chunk, max_length=180, min_length=40, do_sample=False)
48
  summaries.append(result[0]['summary_text'])
49
  except Exception as e:
50
- summaries.append(f"[Ошибка в части {len(summaries) + 1}: {str(e)}]")
51
-
52
- final_summary = "\n\n".join(summaries)
53
- return final_summary.strip()
54
 
55
  def save_summary_as_docx(summary_text):
56
- """Сохранение в DOCX"""
57
  doc = Document()
58
  doc.add_heading("Резюме документа", level=1)
59
  doc.add_paragraph(summary_text)
60
- path = "summary.docx"
61
  doc.save(path)
62
  return path
63
 
64
  def save_summary_as_txt(summary_text):
65
- """Сохранение в TXT"""
66
  path = "summary.txt"
67
  with open(path, "w", encoding="utf-8") as f:
68
  f.write(summary_text)
69
  return path
70
 
71
  def save_summary_as_pdf(summary_text):
72
- """Сохранение в PDF с русским шрифтом"""
73
  path = "summary.pdf"
74
  pdf = FPDF()
75
  pdf.add_page()
76
-
77
- # Используем встроенный шрифт DejaVuSans.ttf, который нужно будет добавить в проект
78
  try:
79
  pdf.add_font('DejaVu', '', 'DejaVuSans.ttf', uni=True)
80
  pdf.set_font('DejaVu', '', 12)
81
  except:
82
  pdf.set_font("Arial", size=12)
83
-
84
  pdf.multi_cell(0, 10, summary_text)
85
  pdf.output(path)
86
  return path
87
 
88
  def summarize_file(file):
89
- """Главная функция"""
90
  text = read_file(file)
91
  if text.startswith("Ошибка"):
92
  return text, None, None, None
93
-
94
  summary = summarize_text(text)
95
  txt_path = save_summary_as_txt(summary)
96
  docx_path = save_summary_as_docx(summary)
97
  pdf_path = save_summary_as_pdf(summary)
98
-
99
  return summary, txt_path, docx_path, pdf_path
100
 
101
- # Интерфейс Gradio
102
  demo = gr.Interface(
103
  fn=summarize_file,
104
  inputs=gr.File(label="Загрузите документ (.pdf, .docx или .txt)"),
@@ -109,7 +109,7 @@ demo = gr.Interface(
109
  gr.File(label="📕 Скачать PDF")
110
  ],
111
  title="🧠 Eroha Summarizer",
112
- description="Загрузите PDF, DOCX или TXT файл. Модель создаст краткое резюме и позволит скачать результат в любом формате.",
113
  )
114
 
115
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
4
  import docx2txt
5
  from docx import Document
6
  from fpdf import FPDF
7
+ from langdetect import detect
8
 
9
+ # Модели суммаризации
10
+ summarizers = {
11
+ "en": pipeline("summarization", model="facebook/bart-large-cnn"),
12
+ "ru": pipeline("summarization", model="IlyaGusev/mbart_ru_sum_gazeta"),
13
+ "kz": pipeline("summarization", model="csebuetnlp/mT5_multilingual_XLSum")
14
+ }
15
 
16
  def read_file(file):
 
17
  if file is None:
18
  return ""
 
19
  filename = file.name.lower()
20
  text = ""
 
21
  try:
22
  if filename.endswith(".pdf"):
23
  with pdfplumber.open(file.name) as pdf:
 
31
  text = file.read().decode("utf-8", errors="ignore")
32
  except Exception as e:
33
  return f"Ошибка при чтении файла: {str(e)}"
 
34
  return text.strip()
35
 
36
+ def detect_language(text):
37
+ try:
38
+ lang = detect(text)
39
+ if lang.startswith("ru"):
40
+ return "ru"
41
+ elif lang.startswith("kk") or "қ" in text or "ә" in text or "ң" in text:
42
+ return "kz"
43
+ else:
44
+ return "en"
45
+ except:
46
+ return "en"
47
+
48
  def summarize_text(text):
 
49
  if not text or len(text) < 50:
50
  return "⚠️ Недостаточно текста для суммаризации."
51
+ lang = detect_language(text)
52
+ model = summarizers.get(lang, summarizers["en"])
53
+ chunk_size = 2500
54
+ overlap = 200
55
  summaries = []
 
56
  for i in range(0, len(text), chunk_size - overlap):
57
  chunk = text[i:i + chunk_size]
58
  try:
59
+ result = model(chunk, max_length=180, min_length=40, do_sample=False)
60
  summaries.append(result[0]['summary_text'])
61
  except Exception as e:
62
+ summaries.append(f"[Ошибка в части {len(summaries)+1}: {str(e)}]")
63
+ return "\n\n".join(summaries).strip()
 
 
64
 
65
  def save_summary_as_docx(summary_text):
66
+ path = "summary.docx"
67
  doc = Document()
68
  doc.add_heading("Резюме документа", level=1)
69
  doc.add_paragraph(summary_text)
 
70
  doc.save(path)
71
  return path
72
 
73
  def save_summary_as_txt(summary_text):
 
74
  path = "summary.txt"
75
  with open(path, "w", encoding="utf-8") as f:
76
  f.write(summary_text)
77
  return path
78
 
79
  def save_summary_as_pdf(summary_text):
 
80
  path = "summary.pdf"
81
  pdf = FPDF()
82
  pdf.add_page()
 
 
83
  try:
84
  pdf.add_font('DejaVu', '', 'DejaVuSans.ttf', uni=True)
85
  pdf.set_font('DejaVu', '', 12)
86
  except:
87
  pdf.set_font("Arial", size=12)
 
88
  pdf.multi_cell(0, 10, summary_text)
89
  pdf.output(path)
90
  return path
91
 
92
  def summarize_file(file):
 
93
  text = read_file(file)
94
  if text.startswith("Ошибка"):
95
  return text, None, None, None
 
96
  summary = summarize_text(text)
97
  txt_path = save_summary_as_txt(summary)
98
  docx_path = save_summary_as_docx(summary)
99
  pdf_path = save_summary_as_pdf(summary)
 
100
  return summary, txt_path, docx_path, pdf_path
101
 
 
102
  demo = gr.Interface(
103
  fn=summarize_file,
104
  inputs=gr.File(label="Загрузите документ (.pdf, .docx или .txt)"),
 
109
  gr.File(label="📕 Скачать PDF")
110
  ],
111
  title="🧠 Eroha Summarizer",
112
+ description="Автоматически определяет язык документа (RU / EN / KZ) и создаёт краткое резюме."
113
  )
114
 
115
  demo.launch(server_name="0.0.0.0", server_port=7860)