Spaces:
Sleeping
Sleeping
Create backend.py
Browse files- backend.py +104 -0
backend.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# backend.py
|
| 3 |
+
|
| 4 |
+
import spacy
|
| 5 |
+
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
| 6 |
+
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
|
| 7 |
+
from googleapiclient.discovery import build
|
| 8 |
+
import pandas as pd
|
| 9 |
+
from wordcloud import WordCloud
|
| 10 |
+
import matplotlib.pyplot as plt
|
| 11 |
+
import re
|
| 12 |
+
|
| 13 |
+
# Initialize Spacy and VADER
# Small English pipeline used for sentence segmentation in
# split_long_sentences(); the model must be downloaded beforehand
# (python -m spacy download en_core_web_sm) or this raises at import time.
nlp = spacy.load("en_core_web_sm")
sia = SentimentIntensityAnalyzer()  # VADER scorer, shared module-level singleton

# YouTube Data API key
# NOTE(review): placeholder value -- must be replaced with a real key before
# fetch_video_metadata() can work; consider loading it from an env var
# instead of hard-coding it in source.
YOUTUBE_API_KEY = "YOUR_YOUTUBE_API_KEY"
|
| 19 |
+
|
| 20 |
+
# Fetch metadata of YouTube Video
|
| 21 |
+
def fetch_video_metadata(video_url):
    """Fetch channel, title, statistics and publish date for a YouTube video.

    Parameters
    ----------
    video_url : str
        A YouTube watch URL (also accepts youtu.be / embed links, or a
        bare 11-character video ID).

    Returns
    -------
    tuple
        (metadata dict, None) on success, or (None, error message string)
        on failure -- callers must check the second element.
    """
    # Robustly pull the 11-char video ID: the old ``split('v=')[-1]`` kept
    # trailing query params (e.g. "...&t=42s") and failed on youtu.be links.
    match = re.search(r"(?:v=|youtu\.be/|embed/)([A-Za-z0-9_-]{11})", video_url)
    video_id = match.group(1) if match else video_url.split('v=')[-1]

    youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)

    try:
        request = youtube.videos().list(part="snippet,statistics", id=video_id)
        response = request.execute()

        # The API returns an empty 'items' list (not an error) for unknown
        # IDs; indexing [0] blindly would raise IndexError.
        items = response.get('items', [])
        if not items:
            return None, "Video not found."

        video_data = items[0]
        metadata = {
            "channel_name": video_data['snippet']['channelTitle'],
            "video_title": video_data['snippet']['title'],
            "views": video_data['statistics']['viewCount'],
            # Like/dislike counts may be hidden by the uploader.
            "likes": video_data['statistics'].get('likeCount', 'N/A'),
            "dislikes": video_data['statistics'].get('dislikeCount', 'N/A'),
            "posted_date": video_data['snippet']['publishedAt']
        }

        return metadata, None
    except VideoUnavailable:
        return None, "Video is unavailable."
    except Exception as e:
        return None, str(e)
|
| 44 |
+
|
| 45 |
+
# Fetch the transcript for YouTube Video
|
| 46 |
+
def fetch_transcript(video_url):
    """Download and flatten the transcript of a YouTube video.

    Parameters
    ----------
    video_url : str
        A YouTube watch URL (also accepts youtu.be / embed links, or a
        bare 11-character video ID).

    Returns
    -------
    tuple
        (transcript text, None) on success, or (None, error message
        string) when the transcript cannot be retrieved.
    """
    # Robustly pull the 11-char video ID: the old ``split('v=')[-1]`` kept
    # trailing query params (e.g. "...&t=42s") and failed on youtu.be links.
    match = re.search(r"(?:v=|youtu\.be/|embed/)([A-Za-z0-9_-]{11})", video_url)
    video_id = match.group(1) if match else video_url.split('v=')[-1]
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        # Each transcript entry is a dict with a 'text' field; join the
        # snippets into one continuous string.
        text = " ".join(t['text'] for t in transcript)
        return text, None
    except (TranscriptsDisabled, VideoUnavailable):
        return None, "Transcript not available for this video."
    except Exception as e:
        return None, str(e)
|
| 56 |
+
|
| 57 |
+
# Split long sentences into chunks for better processing
|
| 58 |
+
def split_long_sentences(text):
    """Segment *text* into sentences, chunking any sentence over 25 words.

    Sentences produced by the Spacy pipeline that exceed 25 words are
    broken further: a chunk ends at terminal punctuation (. ! ?) or right
    after a coordinating word ("and", "but", "because", "so") once the
    chunk already holds more than 3 tokens. Shorter spans help the
    downstream per-sentence analysis.

    Returns a list of sentence/chunk strings.
    """
    terminals = {".", "!", "?"}
    break_words = {"and", "but", "because", "so"}
    results = []

    for sentence in nlp(text).sents:
        # Short sentences pass through untouched.
        if len(sentence.text.split()) <= 25:
            results.append(sentence.text.strip())
            continue

        chunks = []
        buffer = []
        for token in sentence:
            buffer.append(token.text)
            ends_here = False
            if token.is_punct and token.text in terminals:
                ends_here = True
            elif token.text.lower() in break_words and len(buffer) > 3:
                # The break word stays at the END of the current chunk.
                ends_here = True
            if ends_here:
                chunks.append(" ".join(buffer).strip())
                buffer = []

        # Flush whatever tokens remain after the last break point.
        if buffer:
            chunks.append(" ".join(buffer).strip())
        results.extend(chunks)

    return results
|
| 83 |
+
|
| 84 |
+
# Read the keywords from the provided Excel file
|
| 85 |
+
def read_keywords(file_path):
    """Load keyword lists from an Excel sheet.

    Each column header is treated as an attribute name; the non-empty
    cells beneath it become that attribute's keyword list.

    Parameters
    ----------
    file_path : str or path-like
        Path to the Excel workbook to read.

    Returns
    -------
    tuple
        (keywords, attributes): a dict mapping each attribute to its
        keyword list, and the attribute names in sheet column order.
    """
    frame = pd.read_excel(file_path)

    attribute_names = list(frame.columns)
    # Drop blank cells so each list holds only real keywords.
    keyword_map = {
        column: frame[column].dropna().tolist()
        for column in attribute_names
    }

    return keyword_map, attribute_names
|
| 95 |
+
|
| 96 |
+
# Match keywords with sentences
|
| 97 |
+
def match_keywords_in_sentences(sentences, keywords):
    """Group sentences by the keyword attributes they mention.

    Parameters
    ----------
    sentences : list of str
        Sentences to scan.
    keywords : dict
        Maps an attribute name to a list of keywords indicating it.

    Returns
    -------
    dict
        For every attribute, the sentences containing at least one of its
        keywords (case-insensitive substring match). Each sentence appears
        at most once per attribute -- the original appended one duplicate
        per matching keyword, inflating downstream counts.
    """
    # Lower-case keywords once, instead of once per sentence; str() guards
    # against non-string cells (e.g. numbers) read from the Excel sheet.
    lowered = {
        attribute: [str(keyword).lower() for keyword in sub_keywords]
        for attribute, sub_keywords in keywords.items()
    }

    matched_keywords = {attribute: [] for attribute in keywords}
    for sentence in sentences:
        sentence_lower = sentence.lower()
        for attribute, sub_keywords in lowered.items():
            # any() short-circuits, so a sentence is recorded once even if
            # several of the attribute's keywords occur in it.
            if any(keyword in sentence_lower for keyword in sub_keywords):
                matched_keywords[attribute].append(sentence)
    return matched_keywords
|