temo12 committed on
Commit
3f160eb
·
verified ·
1 Parent(s): fcecf5e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +213 -0
app.py CHANGED
@@ -1,3 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
 
3
  def process_keywords_and_video(url, excel_file):
 
1
# NOTE(review): the original file used IPython shell magics here:
#   !pip install spacy vaderSentiment youtube-transcript-api gradio pandas fpdf openpyxl google-api-python-client wordcloud matplotlib
#   !python -m spacy download en_core_web_sm
# A leading `!` is a SyntaxError in a plain .py file, so these lines broke the
# module at import time. Declare the packages in requirements.txt (and the
# spacy model download in a build step) instead; kept here as comments so the
# exact install commands are not lost.
+
4
+
5
import os
import re

import matplotlib.pyplot as plt
import pandas as pd
import spacy
from fpdf import FPDF
from googleapiclient.discovery import build
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
14
+
15
# Initialize Spacy and VADER once at import time (both are expensive to build).
nlp = spacy.load("en_core_web_sm")
sia = SentimentIntensityAnalyzer()

# YouTube Data API key.
# SECURITY: the original file hard-coded a live API key here and committed it
# to a public repository — that key must be considered leaked and should be
# revoked. Supply a fresh key through the environment (e.g. a Spaces secret)
# instead of embedding it in source.
YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY", "")
21
+
22
+
23
def fetch_video_metadata(video_url):
    """Fetch channel/title/view/like metadata for a YouTube video.

    Returns a ``(metadata_dict, error_message)`` pair; exactly one of the
    two is ``None``.
    """
    # Robust ID extraction: the original `split('v=')[-1]` broke on
    # youtu.be/embed/shorts links and kept trailing query params (&t=42s).
    match = re.search(r"(?:v=|youtu\.be/|embed/|shorts/)([A-Za-z0-9_-]{11})", video_url)
    video_id = match.group(1) if match else video_url.split('v=')[-1].split('&')[0]

    youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)

    try:
        request = youtube.videos().list(part="snippet,statistics", id=video_id)
        response = request.execute()

        items = response.get('items', [])
        if not items:
            # The original indexed response['items'][0] directly, which raised
            # a bare IndexError for unknown/private IDs instead of a message.
            return None, "No video found for this URL."

        video_data = items[0]
        metadata = {
            "channel_name": video_data['snippet']['channelTitle'],
            "video_title": video_data['snippet']['title'],
            "views": video_data['statistics']['viewCount'],
            # like/dislike counts can be hidden by the uploader.
            "likes": video_data['statistics'].get('likeCount', 'N/A'),
            "dislikes": video_data['statistics'].get('dislikeCount', 'N/A'),
            "posted_date": video_data['snippet']['publishedAt'],
        }
        return metadata, None

    except VideoUnavailable:
        return None, "Video is unavailable."
    except Exception as e:
        return None, str(e)
49
+
50
+
51
def fetch_transcript(video_url):
    """Download the full transcript text for a YouTube video.

    Returns a ``(transcript_text, error_message)`` pair; exactly one of
    the two is ``None``.
    """
    # Robust ID extraction: the original `split('v=')[-1]` broke on
    # youtu.be/embed/shorts links and kept trailing query params (&t=42s).
    match = re.search(r"(?:v=|youtu\.be/|embed/|shorts/)([A-Za-z0-9_-]{11})", video_url)
    video_id = match.group(1) if match else video_url.split('v=')[-1].split('&')[0]

    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = " ".join(entry['text'] for entry in transcript)
        return text, None

    except (TranscriptsDisabled, VideoUnavailable):
        return None, "Transcript not available for this video."
    except Exception as e:
        return None, str(e)
63
+
64
+
65
+
66
def split_long_sentences(text):
    """Segment *text* into sentences, sub-splitting any sentence over 25 words.

    Spacy supplies the base sentence boundaries. A long sentence is chopped
    at terminal punctuation (., !, ?) and at connector words
    (and/but/because/so) once the running chunk holds more than three tokens.
    Returns a list of stripped sentence strings.
    """
    result = []

    for sent in nlp(text).sents:
        # Short sentences pass through untouched.
        if len(sent.text.split()) <= 25:
            result.append(sent.text.strip())
            continue

        pieces = []
        chunk = []
        for token in sent:
            chunk.append(token.text)
            ends_sentence = token.is_punct and token.text in {".", "!", "?"}
            is_connector = token.text.lower() in {"and", "but", "because", "so"}
            # Connectors only flush once the chunk is long enough to stand alone.
            if ends_sentence or (is_connector and len(chunk) > 3):
                pieces.append(" ".join(chunk).strip())
                chunk = []

        if chunk:  # trailing tokens after the last flush point
            pieces.append(" ".join(chunk).strip())

        result.extend(pieces)

    return result
92
+
93
def read_keywords(file_path):
    """Load keyword lists from an Excel sheet.

    Each column header is an attribute name; the non-empty cells beneath it
    are that attribute's keywords. Returns ``(keywords_dict, attribute_list)``.
    """
    frame = pd.read_excel(file_path)
    attributes = list(frame.columns)
    keywords = {column: frame[column].dropna().tolist() for column in attributes}
    return keywords, attributes
103
+
104
+
105
def match_keywords_in_sentences(sentences, keywords):
    """Group sentences under each attribute whose keywords they contain.

    Matching is case-insensitive substring containment. A sentence is
    recorded once per keyword hit (a sentence containing two of an
    attribute's keywords appears twice under it) — this duplicate behaviour
    matches the original and is preserved for downstream consumers.

    Keywords are coerced with ``str()`` before matching: Excel columns often
    contain numeric cells, and the original ``keyword.lower()`` raised
    AttributeError on float/int keywords.
    """
    matched_keywords = {attribute: [] for attribute in keywords}

    for sentence in sentences:
        # Hoisted: the original recomputed sentence.lower() for every keyword.
        lowered = sentence.lower()
        for attribute, sub_keywords in keywords.items():
            for keyword in sub_keywords:
                if str(keyword).lower() in lowered:
                    matched_keywords[attribute].append(sentence)

    return matched_keywords
115
+
116
+
117
def analyze_sentiment_for_keywords(matched_keywords, sentences):
    """Split each attribute's matched lines into positive/negative buckets.

    Uses VADER compound scores with the conventional +/-0.05 thresholds;
    lines scoring inside (-0.05, 0.05] are treated as neutral and dropped.
    Each bucket entry is a ``(stripped_line, compound_score)`` tuple.

    ``sentences`` is accepted for call-site compatibility but is not
    consulted here.
    """
    sentiment_results = {}

    for attribute, lines in matched_keywords.items():
        buckets = {"positive": [], "negative": []}

        for raw_line in lines:
            compound = sia.polarity_scores(raw_line)['compound']
            if compound > 0.05:
                buckets["positive"].append((raw_line.strip(), compound))
            elif compound < -0.05:
                buckets["negative"].append((raw_line.strip(), compound))

        sentiment_results[attribute] = buckets

    return sentiment_results
137
+
138
+
139
def generate_word_clouds(matched_keywords):
    """Build (and display) a WordCloud per attribute from its matched sentences.

    Returns ``{attribute: WordCloud}``. Attributes with no matched text are
    skipped: ``WordCloud.generate("")`` raises ValueError, which crashed the
    original whenever an attribute had zero keyword hits.
    """
    wordclouds = {}

    for attribute, sentences_list in matched_keywords.items():
        text = " ".join(sentences_list).strip()
        if not text:
            continue  # nothing to render for this attribute

        wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text)
        wordclouds[attribute] = wordcloud

        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.title(f"Word Cloud for {attribute}")
        plt.show()
        # Close the figure so repeated calls don't leak matplotlib figures
        # (matplotlib warns after ~20 open figures in a long-lived process).
        plt.close()

    return wordclouds
155
+
156
+
157
+
158
def generate_pdf_with_sections(metadata, sentiment_results, wordclouds, output_file="Analysis_Report.pdf"):
    """Render the full analysis into a PDF report.

    Layout: a title/metadata page, then one page per attribute listing its
    positive and negative lines followed by its word-cloud image (when one
    exists). Returns the path of the written PDF.
    """

    def _latin1(text):
        # FPDF's built-in core fonts only support latin-1; YouTube titles and
        # transcript lines are frequently unicode, which made the original
        # pdf.output() raise UnicodeEncodeError. Replace unencodable chars.
        return str(text).encode("latin-1", "replace").decode("latin-1")

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    # Title
    pdf.set_font("Arial", "B", 16)
    pdf.cell(200, 10, "Auto-Insight: YouTube Video Sentiment & Attribute Analysis Report", ln=True, align="C")
    pdf.ln(10)

    # Metadata section
    if metadata:
        pdf.set_font("Arial", "B", 14)
        pdf.cell(0, 10, "Video Metadata", ln=True)
        pdf.set_font("Arial", size=12)
        for key, value in metadata.items():
            pdf.cell(0, 10, _latin1(f"{key.replace('_', ' ').title()}: {value}"), ln=True)
        pdf.ln(10)

    # One section per attribute
    for attribute, sentiments in sentiment_results.items():
        pdf.add_page()
        pdf.set_font("Arial", "B", 14)
        pdf.cell(0, 10, _latin1(f"Attribute: {attribute}"), ln=True)
        pdf.ln(5)

        pdf.set_font("Arial", "B", 12)
        pdf.cell(0, 10, "Positive Sentiments:", ln=True)
        pdf.set_font("Arial", size=12)
        for line, score in sentiments["positive"]:
            pdf.multi_cell(0, 10, _latin1(f"Line: {line}\nScore: {score}"))
            pdf.ln(2)

        pdf.set_font("Arial", "B", 12)
        pdf.cell(0, 10, "Negative Sentiments:", ln=True)
        pdf.set_font("Arial", size=12)
        for line, score in sentiments["negative"]:
            pdf.multi_cell(0, 10, _latin1(f"Line: {line}\nScore: {score}"))
            pdf.ln(2)

        if attribute in wordclouds:
            image_path = f"{attribute}_wordcloud.png"
            # WordCloud writes the PNG directly — no need to round-trip through
            # a matplotlib figure as the original did.
            wordclouds[attribute].to_file(image_path)
            # Place the image at the current cursor instead of the original
            # hard-coded y=80, which overlapped the sentiment text above it.
            pdf.image(image_path, x=10, w=180)
            try:
                os.remove(image_path)  # cleanup — the original leaked these temp files
            except OSError:
                pass

    pdf.output(output_file)
    return output_file
210
+
211
+
212
+
213
+
214
  import gradio as gr
215
 
216
  def process_keywords_and_video(url, excel_file):