Upload 2 files
Browse files- app.py +26 -17
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -3,29 +3,30 @@ import gradio as gr
|
|
| 3 |
import torch
|
| 4 |
from peft import PeftModel, PeftConfig
|
| 5 |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
|
|
|
| 6 |
|
| 7 |
-
def load_data(file_obj):
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
|
| 15 |
-
|
| 16 |
|
| 17 |
def preprocessing(data):
|
| 18 |
texts = list()
|
| 19 |
|
| 20 |
i = 0
|
| 21 |
-
if len(data) <= i+
|
| 22 |
texts = data
|
| 23 |
else:
|
| 24 |
while len(data[i:]) != 0:
|
| 25 |
-
if len(data[i:]) >
|
| 26 |
-
string = str(data[i:i+
|
| 27 |
texts.append(string)
|
| 28 |
-
i = i +
|
| 29 |
else:
|
| 30 |
string = str(data[i:])
|
| 31 |
texts.append(string)
|
|
@@ -40,13 +41,20 @@ tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
|
|
| 40 |
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, device_map='auto') # load_in_8bit=True,
|
| 41 |
model = PeftModel.from_pretrained(model, peft_model_id, device_map='auto')
|
| 42 |
|
| 43 |
-
def summarize(
|
| 44 |
-
transcript = load_data(file_obj)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
texts = preprocessing(transcript)
|
| 46 |
inputs = tokenizer(texts, return_tensors="pt", padding=True, )
|
| 47 |
|
| 48 |
with torch.no_grad():
|
| 49 |
-
output_tokens = model.generate(input_ids=inputs["input_ids"].to("
|
| 50 |
outputs = tokenizer.batch_decode(output_tokens.detach().cpu().numpy(), skip_special_tokens=True)
|
| 51 |
|
| 52 |
return outputs
|
|
@@ -54,6 +62,7 @@ def summarize(file_obj):
|
|
| 54 |
gr.Interface(
|
| 55 |
fn=summarize,
|
| 56 |
title = 'Summarize Transcripts',
|
| 57 |
-
inputs = gr.File(file_types=["text"], label="Upload a text file.", interactive=True),
|
|
|
|
| 58 |
outputs = gr.Textbox(label="Summary", max_lines=120, interactive=False),
|
| 59 |
-
).launch()
|
|
|
|
| 3 |
import torch
|
| 4 |
from peft import PeftModel, PeftConfig
|
| 5 |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
| 6 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
| 7 |
|
| 8 |
+
# def load_data(file_obj):
|
| 9 |
+
# """
|
| 10 |
+
# Load data from the file object of the gr.File() inputs
|
| 11 |
+
# """
|
| 12 |
+
# path = file_obj.name
|
| 13 |
+
# with open(path, "r") as f:
|
| 14 |
+
# data = f.read()
|
| 15 |
|
| 16 |
+
# return data
|
| 17 |
|
| 18 |
def preprocessing(data, chunk_size=3000, stride=2800):
    """Split a transcript string into overlapping chunks for the summarizer.

    Chunks are ``chunk_size`` characters long and start every ``stride``
    characters, so consecutive chunks overlap by ``chunk_size - stride``
    (200 chars with the defaults) to avoid cutting sentences at hard
    boundaries.

    Args:
        data: Full transcript text.
        chunk_size: Maximum characters per chunk (default 3000, as in the
            original hard-coded implementation).
        stride: Step between chunk starts (default 2800).

    Returns:
        The original string unchanged when it already fits in one chunk
        (preserves the original function's str-or-list return contract),
        otherwise a list of chunk strings.
    """
    # Short input: returned as-is, NOT wrapped in a list — callers
    # (the tokenizer) accept both forms, so this quirk is kept.
    if len(data) <= chunk_size:
        return data

    texts = []
    i = 0
    while len(data[i:]) != 0:
        if len(data[i:]) > chunk_size:
            texts.append(str(data[i:i + chunk_size]))
            i = i + stride
        else:
            # Final remainder: append once and stop. The original's loop
            # tail was hidden between diff hunks; without this break the
            # loop would never terminate on the last chunk.
            texts.append(str(data[i:]))
            break
    return texts
|
|
|
|
| 41 |
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, device_map='auto') # load_in_8bit=True,
|
| 42 |
model = PeftModel.from_pretrained(model, peft_model_id, device_map='auto')
|
| 43 |
|
| 44 |
+
def summarize(video_id):
    """Fetch a YouTube video's transcript and summarize it with the PEFT model.

    Args:
        video_id: YouTube video ID (the value after ``v=`` in the URL).

    Returns:
        A list of summary strings, one per transcript chunk produced by
        ``preprocessing``.

    Raises:
        Whatever ``YouTubeTranscriptApi.get_transcript`` raises when the
        video has no transcript or the ID is invalid.
    """
    # transcript = load_data(file_obj)  # previous file-upload input path

    # Each segment is a dict with 'text', 'start' and 'duration' keys;
    # only the text is needed here. str.join avoids quadratic '+='.
    segments = YouTubeTranscriptApi.get_transcript(video_id)
    transcript = "".join(segment['text'] for segment in segments)

    texts = preprocessing(transcript)
    inputs = tokenizer(texts, return_tensors="pt", padding=True)

    with torch.no_grad():
        # Bug fix: the original called .to("device") with the literal
        # string "device", which torch rejects at runtime. Route the
        # input ids to wherever device_map='auto' placed the model.
        output_tokens = model.generate(
            input_ids=inputs["input_ids"].to(model.device),
            max_new_tokens=60,
            do_sample=True,
            top_p=0.9,
        )
    outputs = tokenizer.batch_decode(
        output_tokens.detach().cpu().numpy(), skip_special_tokens=True
    )

    return outputs
|
|
|
|
| 62 |
# Wire up and serve the Gradio UI: a textbox takes the YouTube video ID,
# summarize() produces the text shown in the output box.
# inputs = gr.File(file_types=["text"], label="Upload a text file.", interactive=True),
demo = gr.Interface(
    fn=summarize,
    title='Summarize Transcripts',
    inputs=gr.Textbox(label="Video_ID", interactive=True),
    outputs=gr.Textbox(label="Summary", max_lines=120, interactive=False),
)
demo.launch(debug=True)
|
requirements.txt
CHANGED
|
@@ -1,3 +1,4 @@
|
|
| 1 |
peft
|
| 2 |
transformers==4.27.2
|
| 3 |
gradio
|
|
|
|
|
|
| 1 |
peft
|
| 2 |
transformers==4.27.2
|
| 3 |
gradio
|
| 4 |
+
youtube_transcript_api
|