Iskabore committed on
Commit
9057a10
·
1 Parent(s): 5f53c31

create summarizer module

Browse files
summarizer/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ INPUT_MIN_SIZE = 17  # Minimum number of whitespace-separated words generate_summary() accepts before summarizing.
summarizer/models.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
2
+
3
def _load_summarizer(model_name):
    """Load the tokenizer and seq2seq model for *model_name* and wrap them in a summarization pipeline.

    Returns a ``(tokenizer, model, summarizer)`` tuple so the module can keep
    exposing all three objects under their public names.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
    return tokenizer, model, summarizer


# French summarizer (loaded at import time).
fr_model_name = "plguillou/t5-base-fr-sum-cnndm"
tokenizer_fr, model_fr, summarizer_fr = _load_summarizer(fr_model_name)

# English summarizer (loaded at import time, after the French one).
en_model_name = "facebook/bart-large-cnn"
tokenizer_en, model_en, summarizer_en = _load_summarizer(en_model_name)
summarizer/summarize.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from summarizer import INPUT_MIN_SIZE
2
+ from summarizer.models import summarizer_fr, summarizer_en
3
+ from summarizer.utils import detect_language, read_file
4
+
5
+
6
def generate_summary(text=None, file=None, min_length=30, max_length=100, do_sample=False):
    """Summarize raw text or a file's content with the model matching its language.

    Args:
        text: Text to summarize; only used when ``file`` is falsy.
        file: File path passed to ``read_file``; when truthy it takes
            precedence over ``text``.
        min_length: Forwarded unchanged to the summarization pipeline.
        max_length: Forwarded unchanged to the summarization pipeline.
        do_sample: Forwarded unchanged to the summarization pipeline.

    Returns:
        The summary text on success. Otherwise a human-readable message:
        a warning when the input is empty or has fewer than
        ``INPUT_MIN_SIZE`` words, or an error when the detected language is
        neither French nor English or the pipeline raises.
    """
    raw_input = read_file(file) if file else (text or "")
    content = raw_input.strip()

    # Reject empty input and anything below the minimum word count.
    if not content or len(content.split()) < INPUT_MIN_SIZE:
        return "⚠️ Input too short or empty."

    # Only French and English have a dedicated summarization pipeline.
    lang = detect_language(content)
    if lang not in ("fr", "en"):
        return f"❌ Unsupported language: {lang}"
    summarizer = summarizer_fr if lang == "fr" else summarizer_en

    try:
        outputs = summarizer(
            content,
            min_length=min_length,
            max_length=max_length,
            do_sample=do_sample,
        )
        return outputs[0]["summary_text"]
    except Exception as exc:
        # Failures are reported to the caller as text instead of being raised.
        return f"❌ Error: {exc!s}"
summarizer/utils.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ import os
3
+ from langdetect import detect
4
+
5
+
6
def read_txt_file(filepath: str) -> str:
    """Read the content of a UTF-8 encoded .txt file.

    The file is decoded with ``utf-8-sig`` so that a leading byte-order mark
    (written by some editors) is dropped instead of ending up as a stray
    U+FEFF character at the start of the text. Files without a BOM decode
    exactly as they would with plain ``utf-8``.

    Args:
        filepath: Path of the text file to read.

    Returns:
        The file content, or an empty string if the file cannot be read
        (the error is printed rather than raised).
    """
    try:
        with open(filepath, "r", encoding="utf-8-sig") as f:
            return f.read()
    except Exception as e:
        # Best-effort read: report the problem and let the caller treat the
        # input as empty.
        print(f"❌ Error reading TXT file: {e}")
        return ""
15
+
16
+
17
def read_pdf_file(filepath: str) -> str:
    """Extract the text of every page of a PDF file using PyPDF2.

    Page texts are joined with a newline so that the last word of one page
    is not glued to the first word of the next, and a page for which
    ``extract_text()`` yields nothing contributes an empty string.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        The extracted text with surrounding whitespace stripped, or an
        empty string if the file cannot be read or parsed (the error is
        printed rather than raised).
    """
    try:
        with open(filepath, "rb") as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Extraction must happen while the underlying file is still open.
            page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
        return "\n".join(page_texts).strip()
    except Exception as e:
        # Best-effort read: report the problem and let the caller treat the
        # input as empty.
        print(f"❌ Error reading PDF file: {e}")
        return ""
30
+
31
+
32
def read_file(filepath: str) -> str:
    """Return the text content of a .txt or .pdf file.

    The reader is chosen from the file extension, compared
    case-insensitively. Any other extension is reported on stdout and
    yields an empty string.
    """
    extension = os.path.splitext(filepath)[1].lower()

    if extension == ".txt":
        return read_txt_file(filepath)
    if extension == ".pdf":
        return read_pdf_file(filepath)

    print(f"❌ Unsupported file type: {filepath}")
    return ""
41
+
42
+
43
def detect_language(text: str) -> str:
    """Return the language code langdetect reports for *text*.

    Any failure during detection is swallowed and reported as the string
    ``"unknown"`` instead of raising.
    """
    try:
        language_code = detect(text)
    except Exception:
        return "unknown"
    return language_code