# backend.py
import os
import re

import spacy
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
from googleapiclient.discovery import build

# Initialize spaCy and VADER
nlp = spacy.load("en_core_web_sm")
sia = SentimentIntensityAnalyzer()

# YouTube Data API key, read from the environment rather than hardcoded in
# source (the original embedded the key as a string literal, which leaks it)
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
# Extract the 11-character video ID from a watch URL or youtu.be short link
# (a plain split on 'v=' breaks on URLs with extra query parameters)
def extract_video_id(video_url):
    match = re.search(r"(?:v=|youtu\.be/)([\w-]{11})", video_url)
    return match.group(1) if match else video_url

# Fetch metadata of a YouTube video
def fetch_video_metadata(video_url):
    video_id = extract_video_id(video_url)
    youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)
    try:
        request = youtube.videos().list(part="snippet,statistics", id=video_id)
        response = request.execute()
        # The Data API signals a bad or private ID with an empty items list,
        # not an exception (VideoUnavailable belongs to youtube_transcript_api)
        if not response["items"]:
            return None, "Video is unavailable."
        video_data = response["items"][0]
        metadata = {
            "channel_name": video_data["snippet"]["channelTitle"],
            "video_title": video_data["snippet"]["title"],
            "views": video_data["statistics"]["viewCount"],
            "likes": video_data["statistics"].get("likeCount", "N/A"),
            # Public dislike counts were removed by YouTube in 2021, so this
            # almost always falls back to "N/A"
            "dislikes": video_data["statistics"].get("dislikeCount", "N/A"),
            "posted_date": video_data["snippet"]["publishedAt"]
        }
        return metadata, None
    except Exception as e:
        return None, str(e)
# Fetch the transcript for a YouTube video
def fetch_transcript(video_url):
    video_id = extract_video_id(video_url)
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = " ".join(t["text"] for t in transcript)
        return text, None
    except (TranscriptsDisabled, VideoUnavailable):
        return None, "Transcript not available for this video."
    except Exception as e:
        return None, str(e)
# Split long sentences into smaller chunks for better processing
def split_long_sentences(text):
    doc = nlp(text)  # Tokenize into sentences using spaCy
    sentences = []
    for sent in doc.sents:
        # Only break up sentences longer than 25 words
        if len(sent.text.split()) > 25:
            sub_sentences = []
            current_chunk = []
            for token in sent:
                current_chunk.append(token.text)
                # Close a chunk at sentence-final punctuation
                if token.is_punct and token.text in {".", "!", "?"}:
                    sub_sentences.append(" ".join(current_chunk).strip())
                    current_chunk = []
                # Or at a conjunction, provided the chunk is not trivially short
                elif token.text.lower() in {"and", "but", "because", "so"}:
                    if len(current_chunk) > 3:
                        sub_sentences.append(" ".join(current_chunk).strip())
                        current_chunk = []
            if current_chunk:  # Flush any trailing tokens
                sub_sentences.append(" ".join(current_chunk).strip())
            sentences.extend(sub_sentences)
        else:
            sentences.append(sent.text.strip())
    return sentences
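# Illustrative behavior on a made-up review sentence (exact chunking depends
# on the spaCy model's tokenization, so treat this as a sketch):
#
#   split_long_sentences(
#       "The camera is great and the battery easily lasts a full day because "
#       "the chip is efficient, but the speakers are weak so I always end up "
#       "reaching for headphones when I watch longer videos."
#   )
#   # -> chunks like ["The camera is great and",
#   #                 "the battery easily lasts a full day because", ...]
#
# Note the conjunction stays at the end of the chunk it closes, since each
# token is appended before the split check runs.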
# Read the keywords from the provided Excel file
def read_keywords(file_path):
    df = pd.read_excel(file_path)
    attributes = df.columns.tolist()
    keywords = {}
    for attribute in attributes:
        # Cast to str so numeric cells don't break .lower() during matching
        keywords[attribute] = df[attribute].dropna().astype(str).tolist()
    return keywords, attributes
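# Expected workbook layout: one column per attribute, with that attribute's
# keywords listed down the rows (headers become the dict keys). The values
# below are made-up examples, not taken from the original keyword file:
#
#   Camera    Battery        Display
#   lens      charge         screen
#   photo     battery life   brightness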
# Match keywords with sentences, grouping hits by attribute
def match_keywords_in_sentences(sentences, keywords):
    matched_keywords = {attribute: [] for attribute in keywords}
    for sentence in sentences:
        for attribute, sub_keywords in keywords.items():
            for keyword in sub_keywords:
                if keyword.lower() in sentence.lower():
                    matched_keywords[attribute].append(sentence)
                    break  # Record each sentence once per attribute
    return matched_keywords
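# Minimal end-to-end sketch of how these helpers chain together. The URL and
# "keywords.xlsx" path are placeholders, and the per-attribute VADER scoring
# is one plausible use of the sia analyzer initialized above, not necessarily
# the original app's exact flow.
if __name__ == "__main__":
    url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"  # placeholder URL

    metadata, err = fetch_video_metadata(url)
    print(metadata if err is None else f"Metadata error: {err}")

    transcript, err = fetch_transcript(url)
    if err:
        print(f"Transcript error: {err}")
    else:
        sentences = split_long_sentences(transcript)
        keywords, attributes = read_keywords("keywords.xlsx")  # placeholder path
        matches = match_keywords_in_sentences(sentences, keywords)
        for attribute, hits in matches.items():
            # Average VADER compound score over the sentences matched per attribute
            scores = [sia.polarity_scores(s)["compound"] for s in hits]
            avg = sum(scores) / len(scores) if scores else 0.0
            print(f"{attribute}: {len(hits)} sentences, avg sentiment {avg:+.3f}")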