temo12 committed on
Commit
8946ed9
·
verified ·
1 Parent(s): cd097b0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -228
app.py CHANGED
@@ -1,234 +1,6 @@
1
 
2
 
3
 
4
- import spacy
5
- from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
6
- from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
7
- from googleapiclient.discovery import build
8
- from fpdf import FPDF
9
- import pandas as pd
10
- import re
11
- from wordcloud import WordCloud
12
- import matplotlib.pyplot as plt
13
-
14
# Initialize Spacy and VADER
import os

nlp = spacy.load("en_core_web_sm")
sia = SentimentIntensityAnalyzer()

# YouTube Data API key.
# SECURITY: a real key was hard-coded and committed here; it should be revoked
# and supplied via the environment. The literal is kept only as a fallback so
# existing deployments keep working.
YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY", "AIzaSyBlI0XNuRAlG7WF3wlsiD5cUkIw7cmhER4")
20
-
21
-
22
def fetch_video_metadata(video_url):
    """Fetch channel/title/statistics metadata for a YouTube video.

    Parameters
    ----------
    video_url : str
        A YouTube watch URL (``watch?v=...``, ``youtu.be/...``) or a bare id.

    Returns
    -------
    tuple
        ``(metadata_dict, None)`` on success, ``(None, error_message)`` on failure.
    """
    video_id = _extract_video_id(video_url)

    youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)

    try:
        request = youtube.videos().list(part="snippet,statistics", id=video_id)
        response = request.execute()

        items = response.get('items', [])
        if not items:
            # An unknown/bad id returns an empty list; previously this
            # raised IndexError, masked by the generic handler below.
            return None, "Video not found."
        video_data = items[0]

        metadata = {
            "channel_name": video_data['snippet']['channelTitle'],
            "video_title": video_data['snippet']['title'],
            "views": video_data['statistics']['viewCount'],
            "likes": video_data['statistics'].get('likeCount', 'N/A'),
            "dislikes": video_data['statistics'].get('dislikeCount', 'N/A'),
            "posted_date": video_data['snippet']['publishedAt']
        }

        return metadata, None

    except VideoUnavailable:
        return None, "Video is unavailable."
    except Exception as e:
        return None, str(e)


def _extract_video_id(video_url):
    """Extract the 11-character video id from common YouTube URL shapes.

    The old ``split('v=')[-1]`` kept trailing query parameters (e.g. "&t=30s")
    and failed on youtu.be short links. Falls back to the naive split for
    anything the pattern does not match.
    """
    match = re.search(r"(?:v=|youtu\.be/)([A-Za-z0-9_-]{11})", video_url)
    if match:
        return match.group(1)
    return video_url.split('v=')[-1]
48
-
49
-
50
def fetch_transcript(video_url):
    """Download and concatenate the transcript text of a YouTube video.

    Returns ``(transcript_text, None)`` on success or ``(None, error_message)``.
    """
    # Robust id extraction: the old split('v=')[-1] kept trailing query
    # params (e.g. "&t=30s") and failed on youtu.be short links.
    match = re.search(r"(?:v=|youtu\.be/)([A-Za-z0-9_-]{11})", video_url)
    video_id = match.group(1) if match else video_url.split('v=')[-1]

    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = " ".join(t['text'] for t in transcript)
        return text, None

    except (TranscriptsDisabled, VideoUnavailable):
        return None, "Transcript not available for this video."
    except Exception as e:
        return None, str(e)
62
-
63
-
64
-
65
def split_long_sentences(text):
    """Split *text* into sentences, further breaking up very long ones.

    Sentences are segmented with the module-level Spacy pipeline ``nlp``.
    Any sentence longer than 25 whitespace-separated words is re-split at
    terminal punctuation (. ! ?) and at common coordinating conjunctions
    (and / but / because / so), so downstream sentiment scoring sees
    shorter, more focused units.

    Returns a list of sentence strings. NOTE(review): sub-chunks are
    re-joined with single spaces between tokens, so original spacing
    around punctuation is not preserved.
    """
    doc = nlp(text)  # Tokenize into sentences using Spacy
    sentences = []

    for sent in doc.sents:
        # Only sentences longer than 25 words get the extra splitting pass.
        if len(sent.text.split()) > 25:
            sub_sentences = []
            current_chunk = []
            for token in sent:
                current_chunk.append(token.text)
                if token.is_punct and token.text in {".", "!", "?"}:
                    # Hard break: flush the chunk at sentence-final punctuation.
                    sub_sentences.append(" ".join(current_chunk).strip())
                    current_chunk = []
                elif token.text.lower() in {"and", "but", "because", "so"}:
                    # Soft break at a conjunction, but only if the chunk already
                    # has some substance (> 3 tokens, conjunction included).
                    if len(current_chunk) > 3:
                        sub_sentences.append(" ".join(current_chunk).strip())
                        current_chunk = []

            if current_chunk:
                # Flush trailing words that had no closing punctuation.
                sub_sentences.append(" ".join(current_chunk).strip())

            sentences.extend(sub_sentences)
        else:
            sentences.append(sent.text.strip())

    return sentences
91
-
92
def read_keywords(file_path):
    """Load attribute -> keyword lists from an Excel sheet.

    Each column header is an attribute name; the non-empty cells below it
    are that attribute's keywords.

    Returns ``(keywords_dict, attribute_names)``.
    """
    df = pd.read_excel(file_path)

    attributes = df.columns.tolist()
    keywords = {}

    for attribute in attributes:
        # astype(str): Excel cells may parse as numbers/dates; downstream
        # matching calls keyword.lower(), which would crash on non-strings.
        keywords[attribute] = df[attribute].dropna().astype(str).tolist()

    return keywords, attributes
102
-
103
-
104
def match_keywords_in_sentences(sentences, keywords):
    """Collect, per attribute, every sentence containing one of its keywords.

    Matching is a case-insensitive substring test; a sentence is appended
    once per keyword it matches, so duplicates are possible by design.
    """
    hits = {attr: [] for attr in keywords}

    for sentence in sentences:
        lowered = sentence.lower()  # lower-case once, not per keyword
        for attr, kw_list in keywords.items():
            bucket = hits[attr]
            for kw in kw_list:
                if kw.lower() in lowered:
                    bucket.append(sentence)

    return hits
114
-
115
-
116
def analyze_sentiment_for_keywords(matched_keywords, sentences):
    """Score each matched sentence with VADER and bucket it by polarity.

    Compound score > 0.05 counts as positive, < -0.05 as negative;
    neutral lines are dropped. The ``sentences`` argument is unused but
    kept for interface compatibility with existing callers.

    Returns {attribute: {"positive": [(line, score), ...],
                         "negative": [(line, score), ...]}}.
    """
    results = {}

    for attribute, lines in matched_keywords.items():
        pos, neg = [], []

        for line in lines:
            compound = sia.polarity_scores(line)['compound']
            if compound > 0.05:
                pos.append((line.strip(), compound))
            elif compound < -0.05:
                neg.append((line.strip(), compound))

        results[attribute] = {"positive": pos, "negative": neg}

    return results
136
-
137
-
138
def generate_word_clouds(matched_keywords):
    """Build and display one word cloud per attribute from its matched sentences.

    Attributes with no matched text are skipped: WordCloud.generate()
    raises ValueError on empty input, which previously crashed the whole
    report whenever a single attribute had no matches.

    Returns {attribute: WordCloud} for attributes that had text.
    """
    wordclouds = {}

    for attribute, sentences_list in matched_keywords.items():
        text = " ".join(sentences_list).strip()
        if not text:
            # No matched sentences for this attribute -> nothing to render.
            continue

        wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text)
        wordclouds[attribute] = wordcloud

        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.title(f"Word Cloud for {attribute}")
        plt.show()

    return wordclouds
154
-
155
-
156
-
157
def generate_pdf_with_sections(metadata, sentiment_results, wordclouds, output_file="Analysis_Report.pdf"):
    """Render the analysis report (metadata, per-attribute sentiments, word clouds) as a PDF.

    Parameters
    ----------
    metadata : dict | None
        Video metadata as produced by ``fetch_video_metadata``.
    sentiment_results : dict
        {attribute: {"positive": [(line, score)], "negative": [(line, score)]}}.
    wordclouds : dict
        {attribute: WordCloud} as produced by ``generate_word_clouds``.
    output_file : str
        Destination PDF path.

    Returns the output file path.
    """
    import os

    def _latin1(text):
        # Classic FPDF supports latin-1 only; transcripts routinely contain
        # characters outside it (curly quotes, emoji), which previously made
        # cell()/multi_cell() raise an encoding error mid-report.
        return str(text).encode('latin-1', 'replace').decode('latin-1')

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    # Report title
    pdf.set_font("Arial", "B", 16)
    pdf.cell(200, 10, "Auto-Insight: YouTube Video Sentiment & Attribute Analysis Report", ln=True, align="C")
    pdf.ln(10)

    # Video metadata section
    if metadata:
        pdf.set_font("Arial", "B", 14)
        pdf.cell(0, 10, "Video Metadata", ln=True)
        pdf.set_font("Arial", size=12)
        for key, value in metadata.items():
            pdf.cell(0, 10, _latin1(f"{key.replace('_', ' ').title()}: {value}"), ln=True)
        pdf.ln(10)

    temp_images = []  # word-cloud PNGs to delete once the PDF is written

    # One page per attribute
    for attribute, sentiments in sentiment_results.items():
        pdf.add_page()
        pdf.set_font("Arial", "B", 14)
        pdf.cell(0, 10, _latin1(f"Attribute: {attribute}"), ln=True)
        pdf.ln(5)

        # Positive sentiments
        pdf.set_font("Arial", "B", 12)
        pdf.cell(0, 10, "Positive Sentiments:", ln=True)
        pdf.set_font("Arial", size=12)
        for line, score in sentiments["positive"]:
            pdf.multi_cell(0, 10, _latin1(f"Line: {line}\nScore: {score}"))
            pdf.ln(2)

        # Negative sentiments
        pdf.set_font("Arial", "B", 12)
        pdf.cell(0, 10, "Negative Sentiments:", ln=True)
        pdf.set_font("Arial", size=12)
        for line, score in sentiments["negative"]:
            pdf.multi_cell(0, 10, _latin1(f"Line: {line}\nScore: {score}"))
            pdf.ln(2)

        # Word cloud image (fixed position, preserving the original layout)
        if attribute in wordclouds:
            image_path = f"{attribute}_wordcloud.png"
            plt.imshow(wordclouds[attribute], interpolation='bilinear')
            plt.axis("off")
            plt.savefig(image_path)
            pdf.image(image_path, x=10, y=80, w=180)
            plt.close()
            temp_images.append(image_path)

    pdf.output(output_file)

    # Remove the temporary PNGs instead of leaking them into the CWD.
    for path in temp_images:
        try:
            os.remove(path)
        except OSError:
            pass

    return output_file
209
-
210
-
211
-
212
-
213
- import gradio as gr
214
-
215
def process_keywords_and_video(url, excel_file):
    """End-to-end pipeline: metadata -> transcript -> keyword sentiment -> PDF report.

    Returns ``(status_message, pdf_path)``; on any fetch failure the error
    message is returned with ``None`` in place of the PDF path.
    """
    metadata, meta_err = fetch_video_metadata(url)
    if meta_err:
        return meta_err, None

    transcript, transcript_err = fetch_transcript(url)
    if transcript_err:
        return transcript_err, None

    sentences = split_long_sentences(transcript)
    keywords, attributes = read_keywords(excel_file)
    matched = match_keywords_in_sentences(sentences, keywords)
    sentiments = analyze_sentiment_for_keywords(matched, sentences)
    clouds = generate_word_clouds(matched)
    report_path = generate_pdf_with_sections(metadata, sentiments, clouds)

    return "Processing completed successfully!", report_path
232
 
233
  # Gradio App
234
  with gr.Blocks() as iface:
 
1
 
2
 
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  # Gradio App
6
  with gr.Blocks() as iface: