geraskalnas remzicam committed on
Commit
8cb5fba
·
0 Parent(s):

Duplicate from remzicam/ted_talks_summarizer

Browse files

Co-authored-by: rc <remzicam@users.noreply.huggingface.co>

Files changed (5) hide show
  1. .gitattributes +34 -0
  2. README.md +14 -0
  3. TED.png +0 -0
  4. app.py +108 -0
  5. requirements.txt +4 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Ted Talks Summarizer
3
+ emoji: 🌖
4
+ colorFrom: pink
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 3.15.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: other
11
+ duplicated_from: remzicam/ted_talks_summarizer
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
TED.png ADDED
app.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """TED Talks Summarizer App."""
2
+
3
+ from re import sub
4
+
5
+ from gradio import Interface, Textbox
6
+ from requests import get
7
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
8
+
9
# Hugging Face Hub identifier of the checkpoint used for summarization.
repo_id = "pszemraj/led-base-book-summary"

# The tokenizer and the seq2seq model are loaded from the same checkpoint so
# that they stay in sync with each other.
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForSeq2SeqLM.from_pretrained(repo_id, low_cpu_mem_usage=True)

# Built once at import time and reused by every request.
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
19
+
20
+
21
def clean_text(text: str) -> str:
    """Clean the subtitle text of a TED talk.

    The first line of the input is discarded (for a ``.vtt`` subtitle file
    this is the file header), parenthesised annotations such as
    ``(Applause)`` are removed, and blank lines and timestamp lines are
    dropped. The remaining lines are stripped and joined with single spaces.

    Args:
        text (str): subtitle of ted talk

    Returns:
        cleaned_text (str): cleaned version of subtitle text
    """
    # Remove text inside parentheses (e.g. applause). The character class
    # stops at the first ")" and never crosses a line break, so two
    # annotations on one line ("(Laughter) Hello (Applause)") no longer
    # swallow the spoken words between them.
    text = sub(r"\([^)\n]*\)", "", text)
    # Split into lines and skip the first one.
    lines = text.split("\n")[1:]
    # Strip BEFORE filtering so whitespace-only lines (including the "\r"
    # left over from CRLF files, or a line that held only an annotation)
    # do not end up as empty tokens that produce stray spaces.
    stripped_lines = (line.strip() for line in lines)
    # Timestamp lines are recognised by the "-->" separator they contain.
    cleaned_text = " ".join(
        line for line in stripped_lines if line and "-->" not in line
    )
    return cleaned_text
39
+
40
+
41
def ted_talk_transcriber(link: str) -> str:
    """Fetch the English subtitle file of a TED talk from its page url.

    The talk page is downloaded, the talk id is read from the first
    ``project_masters/<id>/`` path found in it, and the matching
    ``full.vtt`` subtitle file is downloaded from ``hls.ted.com``.

    Args:
        link (str): url link of ted talks

    Returns:
        raw_text (str): raw transcription of the ted talk

    Raises:
        ValueError: if no ``project_masters/`` path is present in the page,
            so no talk id can be extracted.
        requests.HTTPError: if either request returns an error status.
    """
    # A timeout keeps a stalled connection from blocking the app forever.
    request_timeout = 30

    # Request the page of the talk and fail early on 4xx/5xx responses
    # instead of trying to parse an error page.
    page = get(link, timeout=request_timeout)
    page.raise_for_status()

    # Extract the unique talk id needed to reach the subtitle file. Searching
    # the decoded text avoids depending on the repr of a bytes object.
    _, marker, remainder = page.text.partition("project_masters/")
    if not marker:
        raise ValueError(f"Could not find a talk id in the page at {link!r}")
    talk_id = remainder.split("/")[0]

    subtitles = get(
        f"https://hls.ted.com/project_masters/{talk_id}/subtitles/en/full.vtt",
        timeout=request_timeout,
    )
    subtitles.raise_for_status()
    raw_text = subtitles.text
    return raw_text
58
+
59
+
60
def text_summarizer(text: str) -> str:
    """Summarize the given text with the module-level ``summarizer`` pipeline.

    Args:
        text (str): ted talks transcription

    Returns:
        str: the ``summary_text`` of the first result returned by the pipeline
    """
    # Options forwarded unchanged to the summarization pipeline call.
    generation_options = {
        "min_length": 8,
        "max_length": 256,
        "no_repeat_ngram_size": 3,
        "encoder_no_repeat_ngram_size": 3,
        "repetition_penalty": 3.5,
        "num_beams": 4,
        "do_sample": False,
        "early_stopping": True,
    }
    summaries = summarizer(text, **generation_options)
    first_summary = summaries[0]
    return first_summary["summary_text"]
81
+
82
+
83
def main(link: str) -> str:
    """Summarize a TED talk given the url of its page.

    Chains the three steps of the app: download the subtitles, clean them,
    and summarize the cleaned transcript.

    Args:
        link (str): url link of ted talks

    Returns:
        str: summary
    """
    transcript = clean_text(ted_talk_transcriber(link))
    summary = text_summarizer(transcript)
    return summary
95
+
96
+
97
# HTML snippet used as the interface description; it shows the TED logo
# image centred on the page.
logo = "<center><img src='file/TED.png' width=180px></center>"

# Input and output widgets of the app.
link_box = Textbox(label="Type the TED Talks link")
summary_box = Textbox(label="Summary")

# Build the interface around `main`, then start serving it.
demo = Interface(
    main,
    inputs=link_box,
    outputs=summary_box,
    examples=[
        "https://www.ted.com/talks/jen_gunter_the_truth_about_yeast_in_your_body"
    ],
    description=logo,
    allow_flagging="never",
)
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ accelerate
2
+ --find-links https://download.pytorch.org/whl/torch_stable.html
3
+ torch==1.13.1+cpu
4
+ transformers