Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import onnxruntime | |
| from transformers import AutoTokenizer | |
| import torch | |
| import os | |
| from transformers import pipeline | |
| import subprocess | |
| import moviepy.editor as mp | |
| import base64 | |
# --- Topic classifier: tokenizer plus ONNX inference session ---
token = AutoTokenizer.from_pretrained('distilroberta-base')
inf_session = onnxruntime.InferenceSession('classifier-quantized2.onnx')
# Names of the session's first input and first output; classify() passes
# them to inf_session.run().
input_name = inf_session.get_inputs()[0].name
output_name = inf_session.get_outputs()[0].name
# Label names; classify() indexes this list with the position of the
# highest score returned by the session.
classes = [
    'Art',
    'Astrology',
    'Biology',
    'Chemistry',
    'Economics',
    'History',
    'Literature',
    'Philosophy',
    'Physics',
    'Politics',
    'Psychology',
    'Sociology',
]

# --- Audio/Video to text ---
# Use the first CUDA device when torch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny.en",
    chunk_length_s=30,
    device=device,
)

# --- Text summary ---
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    device=device,
)
def video_identity(video):
    """Return the transcript produced by the speech-recognition pipeline.

    ``video`` is handed unchanged to the module-level ``pipe`` (``classify``
    passes the path of an extracted WAV file); only the ``"text"`` entry of
    the pipeline result is returned.
    """
    return pipe(video)["text"]
def _chunk_sentences(text, max_words):
    """Group the sentences of ``text`` into strings of at most ``max_words`` words."""
    import re  # local import keeps this block independent of the file-level imports

    # Split only where sentence punctuation is followed by whitespace, so the
    # terminators stay attached to their sentence and values such as "1.5"
    # are not cut in half.
    sentences = re.split(r'(?<=[.!?])\s+', (text or '').strip())

    chunks = []
    current = []
    for sentence in sentences:
        words = sentence.split()
        if not words:
            continue
        # Start a new chunk when this sentence would overflow the current one.
        # A single sentence longer than max_words still gets a chunk of its own.
        if current and len(current) + len(words) > max_words:
            chunks.append(' '.join(current))
            current = []
        current.extend(words)
    if current:
        chunks.append(' '.join(current))
    return chunks


def summary(text, *, max_chunk=500, max_length=100):
    """Summarise ``text`` chunk by chunk with the module-level ``summarizer``.

    The text is split at sentence boundaries and the sentences are packed into
    chunks of at most ``max_chunk`` whitespace-separated words. Sentence
    punctuation is preserved so the model sees well-formed input.

    Args:
        text: Text to summarise. ``None`` is treated like an empty string.
        max_chunk: Maximum number of words packed into one chunk.
        max_length: Forwarded to ``summarizer`` as ``max_length``.

    Returns:
        The value returned by ``summarizer`` for the list of chunks; callers
        in this file read ``['summary_text']`` from each item. When the text
        contains no words, ``[{'summary_text': ''}]`` is returned without
        calling the model.
    """
    chunks = _chunk_sentences(text, max_chunk)
    if not chunks:
        # Nothing to summarise: keep the result shape callers index into.
        return [{'summary_text': ''}]
    return summarizer(chunks, max_length=max_length)
def classify(video_file, encoded_video):
    """Transcribe a video, summarise the transcript and predict its topic.

    Args:
        video_file: Path of a video file. Used only when ``encoded_video``
            is empty.
        encoded_video: Base64-encoded video bytes. When non-empty it takes
            precedence over ``video_file``.

    Returns:
        A dict with the keys ``'text'`` (full transcript), ``'summary'``
        (summary of the transcript) and ``'label'`` (one entry of ``classes``).

    Raises:
        ValueError: If neither input is supplied, or the video has no audio
            track to transcribe.
    """
    import tempfile  # local import keeps this block independent of the file-level imports

    if not encoded_video and not video_file:
        raise ValueError("Provide either an uploaded video or a base64-encoded video.")

    # A per-call scratch directory stops concurrent requests from overwriting
    # each other's intermediate files and removes them when the call ends.
    with tempfile.TemporaryDirectory() as workdir:
        if encoded_video:
            video_file = os.path.join(workdir, "temp_video.mp4")
            with open(video_file, "wb") as f:
                f.write(base64.b64decode(encoded_video))

        audio_path = os.path.join(workdir, "audio.wav")
        clip = mp.VideoFileClip(video_file)
        try:
            if clip.audio is None:
                raise ValueError("The supplied video has no audio track to transcribe.")
            clip.audio.write_audiofile(audio_path)
        finally:
            # Release the clip's readers before the scratch directory is removed.
            clip.close()

        full_text = video_identity(audio_path)

    # Use the summary of every chunk, not only the first one, so long
    # transcripts are represented in full.
    summary_text = ' '.join(part['summary_text'] for part in summary(full_text))

    # Let the tokenizer truncate so the closing special token is kept;
    # slicing the id list afterwards would cut it off.
    input_ids = token(summary_text, truncation=True, max_length=512)['input_ids']
    logits = inf_session.run([output_name], {input_name: [input_ids]})[0]

    # Sigmoid is monotonic, so the largest logit is also the most probable class.
    best_index = int(torch.as_tensor(logits)[0].argmax())

    return {
        'text': full_text,
        'summary': summary_text,
        'label': classes[best_index],
    }
# Textbox components; the interface below does not reference them.
text1 = gr.Textbox(label="Text")
text2 = gr.Textbox(label="Summary")

# Web UI: a video upload and a text field are passed to classify(), and the
# returned dict is rendered as JSON.
iface = gr.Interface(
    fn=classify,
    inputs=['video', 'text'],
    outputs=['json'],
)
iface.launch(inline=False)