Spaces:

frknayk
/

YoutubeSummarizer

Runtime error

App Files Files Community

frknayk commited on Mar 23, 2024

Commit

8b6bb87

verified ·

1 Parent(s): 148d1c2

Upload app.py

Browse files

Files changed (1) hide show

app.py +163 -0

app.py ADDED Viewed

	@@ -0,0 +1,163 @@

+import streamlit as st
+# Transcript
+from youtube_transcript_api import YouTubeTranscriptApi
+import os
+# Summarization
+from transformers import (
+    pipeline,
+    AutoModelForSpeechSeq2Seq,
+    AutoProcessor,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+)
+import torch
+import re
+def fetch_transcript(video_url):
+    try:
+        # Extract the video ID from the URL
+        video_id = video_url.split("v=")[1]
+        # Fetch the transcript for the video
+        transcript = YouTubeTranscriptApi.get_transcript(video_id)
+        # Process the transcript data
+        text_transcript = "\n".join([entry['text'] for entry in transcript])
+        return text_transcript
+    except Exception as e:
+        return str(e)
+def clean_transcript(transcript):
+    # Remove non-speech elements (e.g., laughter, background noises)
+    transcript = re.sub(r'\[.*?\]', '', transcript)
+    # Correct spelling and grammar (you can use libraries like NLTK or spaCy for this)
+    # Example:
+    # import nltk
+    # transcript = ' '.join(nltk.word_tokenize(transcript))
+    # Normalize punctuation and formatting
+    transcript = transcript.replace('\n', ' ')  # Remove line breaks
+    transcript = re.sub(r'\s+', ' ', transcript)  # Remove extra whitespaces
+    # Remove timestamps and annotations
+    transcript = re.sub(r'\[\d+:\d+:\d+\]', '', transcript)
+    # Handle speaker identification (if present)
+    # Example: transcript = re.sub(r'Speaker\d+:', '', transcript)
+    # Remove filler words and phrases
+    filler_words = ['like', 'you know', 'sort of']  # Add more as needed
+    for word in filler_words:
+        transcript = transcript.replace(word, '')
+    # Replace common contractions with their expanded forms
+    transcript = transcript.replace("won't", "will not")
+    transcript = transcript.replace("can't", "cannot")
+    transcript = transcript.replace("n't", " not")
+    transcript = transcript.replace("'ll", " will")
+    transcript = transcript.replace("'ve", " have")
+    transcript = transcript.replace("'re", " are")
+    transcript = transcript.replace("'d", " would")
+    transcript = transcript.replace("'s", " is")
+    return transcript.strip()  # Trim leading/trailing whitespaces
+def extract_video_id(url):
+    """Extracts the YouTube video ID from the URL."""
+    match = re.search(r"(?<=v=)[\w-]+", url)
+    if match:
+        return match.group(0)
+    else:
+        return None
+def summarize_transcript(text, llama_pipeline):
+    def summarize_text(llama_pipeline, system_prompt, text):
+        # Format the input text with special tokens for the model
+        text = f"""
+        <s>[INST] <<SYS>>
+        {system_prompt}
+        <</SYS>>
+        {text}[/INST]
+        """
+        # Generate sequences using the pipeline with specified parameters
+        sequences = llama_pipeline(text)
+        # Extract the generated text from the sequences
+        generated_text = sequences[0]["generated_text"]
+        # Trim the generated text to remove the instruction part
+        generated_text = generated_text[generated_text.find('[/INST]')+len('[/INST]'):]
+        # Return the processed generated text
+        return generated_text
+    # Define the maximum input length for each iteration of summarization
+    input_len = 1000
+    # Start an infinite loop to repeatedly summarize the text
+    while True:
+        # Print the current length of the text
+        print(len(text))
+        # Call the chat function to summarize the text. Only the first 'input_len' characters are considered for summarization
+        summary = summarize_text(llama_pipeline, "", "Summarize the following: " + text[0:input_len])
+        if len(summary) < input_len:
+            return summary
+        # Concatenate the current summary with the remaining part of the text for the next iteration
+        text = summary + " " + text[input_len:]
+# Load the model and tokenizer
+@st.cache_resource()
+def load_model():
+    # Define the model name to be used for the chat function
+    model_name = "meta-llama/Llama-2-7b-chat-hf"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    pipeline_llama2 = pipeline(
+        "text-generation", #task
+        model=model_name,
+        tokenizer=tokenizer,
+        torch_dtype=torch.bfloat16,
+        trust_remote_code=True,
+        device_map="auto",
+        # max_length=max_token_length,
+        do_sample=True,
+        top_k=10,
+        num_return_sequences=1,
+        eos_token_id=tokenizer.eos_token_id
+    )
+    return pipeline_llama2
+def main():
+    st.title("YouTube Video Preview")
+    with st.spinner('Loading checkpoint shards of LLAMA-2'):
+        pipeline_llama2 = load_model()
+    st.success('Done!')
+    # Input field for the YouTube video link
+    youtube_url = st.text_input("Paste YouTube Video Link:")
+    # Extract video ID from the URL
+    video_id = extract_video_id(youtube_url)
+    # Display video preview if video ID is found
+    if video_id:
+        video_url = f"https://www.youtube.com/watch?v={video_id}"
+        st.video(video_url, format='video/mp4')
+        video_transcript = clean_transcript(fetch_transcript(video_url))
+        if video_transcript:
+            # Display transcript and summary side by side
+            col1, col2 = st.columns(2)
+            with col1:
+                st.subheader("Transcript:")
+                st.text_area(" ", video_transcript, height=400)
+            with col2:
+                st.subheader("Summary:")
+                video_summary = summarize_transcript(video_transcript, pipeline_llama2)
+                st.text_area(" ", video_summary, height=400)
+                print(f"Summary:{video_summary}")
+        else:
+            st.error("Failed to fetch video transcript. Please check the video ID or try again later.")
+    elif youtube_url:
+        st.warning("Invalid YouTube Video Link")
+if __name__ == "__main__":
+    main()