Spaces:

temo12
/

Auto-Insight

Sleeping

App Files Files Community

temo12 committed on Dec 31, 2024

Commit

684b28f

verified ·

1 Parent(s): a52005e

Update backend.py

Browse files

Files changed (1) hide show

backend.py +54 -23

backend.py CHANGED Viewed

@@ -1,14 +1,11 @@
-# backend.py
 import spacy
 from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
 from googleapiclient.discovery import build
-import pandas as pd
-from wordcloud import WordCloud
-import matplotlib.pyplot as plt
 import re
 # Initialize Spacy and VADER
 nlp = spacy.load("en_core_web_sm")
@@ -17,16 +14,16 @@ sia = SentimentIntensityAnalyzer()
 # YouTube Data API key
 YOUTUBE_API_KEY = "AIzaSyDUVh0epMGyeAFwaGl2v58tqlwcsIXzAcU"
-# Fetch metadata of YouTube Video
 def fetch_video_metadata(video_url):
     video_id = video_url.split('v=')[-1]
     youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)
     try:
         request = youtube.videos().list(part="snippet,statistics", id=video_id)
         response = request.execute()
         video_data = response['items'][0]
         metadata = {
             "channel_name": video_data['snippet']['channelTitle'],
             "video_title": video_data['snippet']['title'],
@@ -37,28 +34,28 @@ def fetch_video_metadata(video_url):
         }
         return metadata, None
     except VideoUnavailable:
         return None, "Video is unavailable."
     except Exception as e:
         return None, str(e)
-# Fetch the transcript for YouTube Video
 def fetch_transcript(video_url):
     video_id = video_url.split('v=')[-1]
     try:
         transcript = YouTubeTranscriptApi.get_transcript(video_id)
         text = " ".join([t['text'] for t in transcript])
         return text, None
     except (TranscriptsDisabled, VideoUnavailable):
         return None, "Transcript not available for this video."
     except Exception as e:
         return None, str(e)
-# Split long sentences into chunks for better processing
 def split_long_sentences(text):
-    doc = nlp(text)  # Tokenize into sentences using Spacy
     sentences = []
     for sent in doc.sents:
         if len(sent.text.split()) > 25:
             sub_sentences = []
@@ -75,25 +72,19 @@ def split_long_sentences(text):
             if current_chunk:
                 sub_sentences.append(" ".join(current_chunk).strip())
             sentences.extend(sub_sentences)
         else:
             sentences.append(sent.text.strip())
     return sentences
-# Read the keywords from the provided Excel file
 def read_keywords(file_path):
-    df = pd.read_excel(file_path)
     attributes = df.columns.tolist()
-    keywords = {}
-    for attribute in attributes:
-        keywords[attribute] = df[attribute].dropna().tolist()
     return keywords, attributes
-# Match keywords with sentences
 def match_keywords_in_sentences(sentences, keywords):
     matched_keywords = {attribute: [] for attribute in keywords}
     for sentence in sentences:
@@ -101,4 +92,44 @@ def match_keywords_in_sentences(sentences, keywords):
             for keyword in sub_keywords:
                 if keyword.lower() in sentence.lower():
                     matched_keywords[attribute].append(sentence)
-    return matched_keywords

 import spacy
+import pandas as pd
 from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
 from googleapiclient.discovery import build
+from fpdf import FPDF
 import re
+from wordcloud import WordCloud
 # Initialize Spacy and VADER
 nlp = spacy.load("en_core_web_sm")
 # YouTube Data API key
 YOUTUBE_API_KEY = "AIzaSyDUVh0epMGyeAFwaGl2v58tqlwcsIXzAcU"
 def fetch_video_metadata(video_url):
     video_id = video_url.split('v=')[-1]
     youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)
     try:
         request = youtube.videos().list(part="snippet,statistics", id=video_id)
         response = request.execute()
         video_data = response['items'][0]
         metadata = {
             "channel_name": video_data['snippet']['channelTitle'],
             "video_title": video_data['snippet']['title'],
         }
         return metadata, None
     except VideoUnavailable:
         return None, "Video is unavailable."
     except Exception as e:
         return None, str(e)
 def fetch_transcript(video_url):
     video_id = video_url.split('v=')[-1]
     try:
         transcript = YouTubeTranscriptApi.get_transcript(video_id)
         text = " ".join([t['text'] for t in transcript])
         return text, None
     except (TranscriptsDisabled, VideoUnavailable):
         return None, "Transcript not available for this video."
     except Exception as e:
         return None, str(e)
 def split_long_sentences(text):
+    doc = nlp(text)
     sentences = []
     for sent in doc.sents:
         if len(sent.text.split()) > 25:
             sub_sentences = []
             if current_chunk:
                 sub_sentences.append(" ".join(current_chunk).strip())
             sentences.extend(sub_sentences)
         else:
             sentences.append(sent.text.strip())
     return sentences
 def read_keywords(file_path):
+    df = pd.read_excel(file_path.name)  # Use file_path.name since it's a Gradio file object
     attributes = df.columns.tolist()
+    keywords = {attribute: df[attribute].dropna().tolist() for attribute in attributes}
     return keywords, attributes
 def match_keywords_in_sentences(sentences, keywords):
     matched_keywords = {attribute: [] for attribute in keywords}
     for sentence in sentences:
             for keyword in sub_keywords:
                 if keyword.lower() in sentence.lower():
                     matched_keywords[attribute].append(sentence)
+    return matched_keywords
def analyze_sentiment_for_keywords(matched_keywords, sentences):
    """Score every matched sentence with VADER's compound polarity.

    Args:
        matched_keywords: Mapping of attribute -> list of sentences that
            matched one of the attribute's keywords.
        sentences: Not used by this function; kept for the existing signature.

    Returns:
        Mapping of attribute -> list of ``{"sentence": ..., "score": ...}``
        dicts, one per matched sentence, in the original order.
    """
    return {
        attribute: [
            {"sentence": text, "score": sia.polarity_scores(text)["compound"]}
            for text in matched
        ]
        for attribute, matched in matched_keywords.items()
    }
def generate_word_clouds(matched_keywords):
    """Build one word cloud per attribute from its matched sentences.

    Attributes with no usable text are skipped instead of aborting the whole
    run: the word-cloud generator raises ``ValueError`` when it has no words
    to plot, and an attribute with zero matched sentences is a normal outcome
    of keyword matching.

    Args:
        matched_keywords: Mapping of attribute -> list of matched sentences.

    Returns:
        Mapping of attribute -> generated ``WordCloud`` for every attribute
        that had at least one plottable word.
    """
    wordclouds = {}
    for attribute, sentences in matched_keywords.items():
        text = " ".join(sentences).strip()
        if not text:
            continue
        try:
            wordclouds[attribute] = WordCloud().generate(text)
        except ValueError:
            # Text consisted only of words the generator discards.
            continue
    return wordclouds
def _latin1_safe(text):
    """Return *text* as a str with non-Latin-1 characters replaced by '?'."""
    return str(text).encode("latin-1", "replace").decode("latin-1")


def generate_pdf_with_sections(metadata, sentiment_results, wordclouds):
    """Write the analysis report to ``sentiment_report.pdf`` and return its path.

    The report holds the video metadata, the per-attribute sentence scores,
    and one page per word-cloud image.

    Args:
        metadata: Dict with ``video_title``, ``channel_name``, ``posted_date``
            and ``views``; a missing key is rendered as ``N/A``.
        sentiment_results: Mapping of attribute -> list of
            ``{"sentence": ..., "score": ...}`` dicts.
        wordclouds: Mapping of attribute -> word-cloud object providing
            ``to_file(path)``.

    Returns:
        The path of the written PDF (``"sentiment_report.pdf"``).
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    def write_line(text):
        # multi_cell wraps at the right margin; a plain cell would let long
        # transcript sentences run off the page. The x position is reset
        # first because some FPDF versions leave the cursor at the right edge
        # after a multi_cell. The built-in Arial font is assumed to cover
        # Latin-1 only, hence the sanitising.
        pdf.set_x(pdf.l_margin)
        pdf.multi_cell(0, 10, _latin1_safe(text))

    # Add metadata to PDF
    metadata_fields = (
        ("Video Title", "video_title"),
        ("Channel", "channel_name"),
        ("Posted Date", "posted_date"),
        ("Views", "views"),
    )
    for label, key in metadata_fields:
        write_line(f"{label}: {metadata.get(key, 'N/A')}")

    # Add Sentiment Analysis Results
    for attribute, sentiments in sentiment_results.items():
        # Explicit line break: a "\n" embedded in cell text does not produce
        # the blank line the heading was meant to have.
        pdf.ln(10)
        write_line(f"Sentiments for {attribute}:")
        for sentiment in sentiments:
            write_line(f"  - {sentiment['sentence']} [Score: {sentiment['score']}]")

    # Generate Wordclouds
    for attribute, wordcloud in wordclouds.items():
        # Attribute names come from spreadsheet headers and may contain
        # characters (e.g. "/") that are invalid or unsafe in a filename.
        safe_name = re.sub(r"[^A-Za-z0-9_.-]+", "_", str(attribute)) or "attribute"
        wordcloud_image_path = f"{safe_name}_wordcloud.png"
        wordcloud.to_file(wordcloud_image_path)
        pdf.add_page()
        pdf.image(wordcloud_image_path, x=10, y=10, w=180)

    output_pdf_path = "sentiment_report.pdf"
    pdf.output(output_pdf_path)
    return output_pdf_path