Ankur Mahanta commited on
Commit
bfc3b02
·
1 Parent(s): 80c7b6d

Initial Wave app deployment

Browse files
Files changed (3) hide show
  1. Dockerfile +13 -0
  2. app.py +546 -0
  3. requirements.txt +6 -0
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Container image for the H2O Wave YouTube-transcript chatbot app.
FROM python:3.9

WORKDIR /code

# Copy the whole app (app.py, requirements.txt) into the image.
COPY . .

RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Change the port number of our Wave app to 7860 (Hugging Face Spaces default)
ENV H2O_WAVE_LISTEN=":7860"
ENV H2O_WAVE_ADDRESS='http://127.0.0.1:7860'

# `wave run` serves the `app` module; --no-reload disables dev auto-reload.
CMD ["wave", "run", "app", "--no-reload"]
app.py ADDED
@@ -0,0 +1,546 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #chat is working + key topics working
2
+ from h2o_wave import main, app, Q, ui, data
3
+ from youtube_transcript_api import YouTubeTranscriptApi
4
+ from h2ogpte import H2OGPTE
5
+ import re
6
+ import os
7
+ from dotenv import load_dotenv
8
+ from collections import Counter
9
+ import nltk
10
+ from nltk.tokenize import word_tokenize
11
+ from nltk.corpus import stopwords
12
+ from nltk.sentiment import SentimentIntensityAnalyzer
13
+ from textblob import TextBlob
14
+ from nltk.tokenize import sent_tokenize
15
+ import asyncio
16
+ import logging
17
+
18
# Configure root logging once at import time; every module message below
# goes through this logger.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
23
+
24
class TranscriptAnalyzer:
    """Derives insights from a YouTube transcript.

    Takes the raw transcript entry list (dicts with 'text', 'start',
    'duration') and produces word-frequency counts, per-sentence sentiment,
    key noun-phrase topics, and timestamped segment summaries.
    """

    def __init__(self, transcript_list):
        # Flatten the timed entries into one text body for whole-video analysis.
        self.transcript = ' '.join(entry['text'] for entry in transcript_list)
        self.transcript_list = transcript_list
        self.sia = SentimentIntensityAnalyzer()
        # English stop words plus filler words common in spoken transcripts.
        self.stop_words = set(stopwords.words('english'))
        self.additional_stops = {'um', 'uh', 'like', 'okay', 'right', 'well', 'so'}
        self.stop_words.update(self.additional_stops)

        self.sentences = sent_tokenize(self.transcript)
        self.words = word_tokenize(self.transcript.lower())

    def analyze(self):
        """Run every analysis pass; on any failure return safe placeholders."""
        try:
            return {
                'word_freq': self._analyze_word_frequency(),
                'sentiments': self._analyze_sentiment(),
                'topics': self._extract_topics(),
                'time_segments': self._create_time_segments()
            }
        except Exception as e:
            logger.error(f"Error in transcript analysis: {e}")
            fallback_segment = {
                'start_time': '0:00',
                'end_time': '0:00',
                'text': 'Analysis not available',
                'sentiment': 0.0,
                'topics': []
            }
            return {
                'word_freq': [('no data', 1)],
                'sentiments': [0.0],
                'topics': [('no topics', 1, ['none'])],
                'time_segments': [fallback_segment]
            }

    def _analyze_word_frequency(self):
        """Top 15 meaningful tokens (no stop words, numbers, or short tokens)."""
        stripped = (token.strip('.,!?()[]{}":;') for token in self.words)
        meaningful = [
            token for token in stripped
            if token.isalnum()
            and not token.isnumeric()
            and len(token) > 2
            and token not in self.stop_words
        ]
        return Counter(meaningful).most_common(15)

    def _analyze_sentiment(self):
        """Compound VADER score for each sentence, in order."""
        scores = []
        for sentence in self.sentences:
            scores.append(self.sia.polarity_scores(sentence)['compound'])
        return scores

    def _extract_topics(self):
        """Ten most frequent multi-word noun phrases.

        Each entry is (phrase, count, ['related']) — the third element is a
        placeholder list matching the shape the UI renders.
        """
        phrases = TextBlob(self.transcript).noun_phrases
        multi_word = [p for p in phrases if len(p.split()) >= 2]
        return [
            (phrase, count, ['related'])
            for phrase, count in Counter(multi_word).most_common(10)
        ]

    def _create_time_segments(self):
        """Chunk the transcript into 5-entry segments with real timestamps,
        per-segment sentiment, and up to three noun-phrase topics."""
        chunk_size = 5  # transcript entries per segment
        segments = []

        for offset in range(0, len(self.transcript_list), chunk_size):
            chunk = self.transcript_list[offset:offset + chunk_size]
            chunk_text = ' '.join(entry['text'] for entry in chunk)

            # Segment span: first entry's start through last entry's end.
            begin = chunk[0]['start']
            finish = chunk[-1]['start'] + chunk[-1]['duration']
            begin_min, begin_sec = divmod(int(begin), 60)
            finish_min, finish_sec = divmod(int(finish), 60)

            segments.append({
                'start_time': f"{begin_min}:{begin_sec:02d}",
                'end_time': f"{finish_min}:{finish_sec:02d}",
                'text': chunk_text,
                'sentiment': self.sia.polarity_scores(chunk_text)['compound'],
                'topics': list(TextBlob(chunk_text).noun_phrases)[:3]
            })

        return segments
113
+
114
+
115
# Download required NLTK data safely; failures are logged but non-fatal
# (tokenization/sentiment will fail later only if the data is truly absent).
nltk_dependencies = ['punkt', 'stopwords', 'vader_lexicon']
for dep in nltk_dependencies:
    try:
        nltk.download(dep, quiet=True)
    except Exception as e:
        # Use the module logger (not print) so the message reaches the
        # handlers configured by logging.basicConfig above.
        logger.warning(f"Error downloading {dep}: {e}")

# Load environment variables from a local .env file, if present.
load_dotenv()

# Initialize the h2oGPTe client from the environment.
# NOTE(review): if H2OGPT_URL / H2OGPT_API_KEY are unset these are None and
# client calls will fail at request time — confirm the deployment sets them.
h2ogpt_url = os.getenv('H2OGPT_URL')
h2ogpt_api_key = os.getenv('H2OGPT_API_KEY')

client = H2OGPTE(
    address=h2ogpt_url,
    api_key=h2ogpt_api_key
)
134
+
135
def analyze_transcript(transcript):
    """Analyze a raw transcript string for insights.

    Returns a dict with keys:
      'word_freq'  — list of (word, count), top 10 meaningful words
      'sentiments' — list of compound VADER scores (at most 50 samples)
      'topics'     — list of (phrase, count), top 5 multi-word noun phrases
    Falls back to placeholder values if any analysis step raises.
    """
    try:
        # --- Word frequency ---
        tokens = word_tokenize(transcript.lower())
        stop_words = set(stopwords.words('english'))
        # Filler words and common glue words that dominate spoken text.
        additional_stops = {'um', 'uh', 'like', 'okay', 'right', 'well', 'so', 'and', 'the', 'to', 'of', 'in', 'a', 'is', 'that'}
        stop_words.update(additional_stops)

        # Keep only meaningful words: alphanumeric, not a number, > 2 chars.
        words = [word for word in tokens if (
            word.isalnum() and
            not word.isnumeric() and
            len(word) > 2 and
            word not in stop_words
        )]
        word_freq = Counter(words).most_common(10)

        # --- Sentiment ---
        # Split on sentence punctuation; ignore fragments under 20 chars.
        sentences = [s.strip() for s in re.split('[.!?]', transcript) if len(s.strip()) > 20]
        sia = SentimentIntensityAnalyzer()
        sentiments = [sia.polarity_scores(sentence)['compound'] for sentence in sentences]

        # Downsample long videos to at most 50 representative points.
        if len(sentiments) > 50:
            step = len(sentiments) // 50
            sentiments = sentiments[::step][:50]

        # --- Topics: multi-word noun phrases only ---
        blob = TextBlob(transcript)
        noun_phrases = [phrase for phrase in blob.noun_phrases if len(phrase.split()) >= 2]
        topics = Counter(noun_phrases).most_common(5)

        # Guarantee non-empty results so downstream plots always render.
        if not word_freq:
            word_freq = [('no', 1)]
        if not sentiments:
            sentiments = [0.0]
        if not topics:
            topics = [('no topics found', 1)]

        return {
            'word_freq': word_freq,
            'sentiments': sentiments,
            'topics': topics
        }
    except Exception as e:
        # Use the module logger (not print) for consistency with the rest
        # of the file and so errors reach configured handlers.
        logger.error(f"Error in transcript analysis: {e}")
        return {
            'word_freq': [('error', 1)],
            'sentiments': [0.0],
            'topics': [('error', 1)]
        }
198
+
199
def create_word_frequency_plot(word_freq):
    """Build the (data, plot) pair for a word-frequency bar chart.

    ``word_freq`` is a list of (word, count) pairs.
    """
    rows = [{'word': term, 'count': freq} for term, freq in word_freq]
    freq_data = data(fields=['word', 'count'], rows=rows)
    bar_plot = ui.plot([
        ui.mark(type='interval', x='=word', y='=count')
    ])
    return freq_data, bar_plot
213
+
214
def create_sentiment_plot(sentiments):
    """Build the (data, plot) pair for a sentiment-over-time line chart.

    ``sentiments`` is a sequence of compound scores; the x axis is the
    sample index rendered as a string.
    """
    rows = [
        {'index': str(position), 'sentiment': value}
        for position, value in enumerate(sentiments)
    ]
    line_data = data(fields=['index', 'sentiment'], rows=rows)
    line_plot = ui.plot([
        ui.mark(type='line', x='=index', y='=sentiment')
    ])
    return line_data, line_plot
228
+
229
def extract_video_id(url):
    """Return the 11-character YouTube video ID found in *url*, or None.

    Matches the ID after a ``v=`` query parameter or a path slash
    (covers watch?v=... and youtu.be/... forms).
    """
    found = re.search(r'(?:v=|\/)([\w-]{11})(?:\?|\/|&|$)', url)
    if found is None:
        return None
    return found.group(1)
234
+
235
async def get_transcript(video_id: str):
    """Fetch the transcript entry list for a YouTube video without blocking.

    Runs the synchronous YouTubeTranscriptApi call in the default thread
    executor. Returns the list of transcript entries on success, or an
    'Error: ...' string on failure (callers check for that prefix).
    """
    try:
        # get_running_loop() is the correct call inside a coroutine;
        # get_event_loop() is deprecated here and can create a stray loop.
        loop = asyncio.get_running_loop()
        transcript_list = await loop.run_in_executor(
            None, YouTubeTranscriptApi.get_transcript, video_id
        )
        return transcript_list
    except Exception as e:
        # Log through the module logger rather than print.
        logger.error(f"Error fetching transcript: {e}")
        return f"Error: {str(e)}"
246
+
247
async def setup_h2ogpt_collection(transcript, video_id):
    """Create an h2oGPTe collection seeded with the video transcript.

    Writes the transcript to a temporary local file, uploads and ingests it
    into a fresh collection, then removes the file. Returns the collection
    id, or an 'Error ...' string on failure (callers check for that prefix).
    """
    try:
        collection_id = client.create_collection(
            name=f'YouTube_Video_{video_id}',
            description='YouTube video transcript for chat interaction'
        )

        transcript_path = f'transcript_{video_id}.txt'
        with open(transcript_path, 'w', encoding='utf-8') as f:
            f.write(transcript)

        try:
            with open(transcript_path, 'rb') as f:
                upload_id = client.upload(transcript_path, f)
            client.ingest_uploads(collection_id, [upload_id])
        finally:
            # Always clean up the temp file, even if upload/ingest raises;
            # the original only removed it on the success path.
            os.remove(transcript_path)

        return collection_id
    except Exception as e:
        return f"Error setting up H2O GPT: {str(e)}"
267
+
268
async def get_gpt_response(collection_id, question):
    """Ask h2oGPTe *question* against the given collection.

    Opens a fresh chat session, runs one RAG query with a 60s timeout, and
    returns the answer text — or an 'Error ...' string on failure.
    """
    try:
        session_id = client.create_chat_session(collection_id)
        with client.connect(session_id) as chat:
            answer = chat.query(
                question,
                timeout=60,
                rag_config={"rag_type": "rag"}
            )
            return answer.content
    except Exception as e:
        return f"Error getting response: {str(e)}"
281
+
282
@app('/chatbot')
async def serve(q: Q):
    """Main Wave handler for the /chatbot route.

    The first request from a client builds the page layout; every request
    then routes the user's action: URL submission, chat message, clear,
    export, or a chatbot feedback event.
    """
    # ---------- One-time page setup ----------
    if not q.client.initialized:
        q.client.initialized = True

        # Header
        q.page['header'] = ui.header_card(
            box='1 1 12 1',
            title='YouTube Video Transcript Chatbot & Analysis | xAmplify',
            subtitle='Enter a YouTube URL to analyse and chat about the video content',
            color='primary'
        )

        # URL input form
        q.page['url_form'] = ui.form_card(
            box='1 2 12 1',
            items=[
                ui.inline([
                    ui.textbox(
                        name='video_url',
                        placeholder='Enter YouTube video URL...',
                        width='800px'
                    ),
                    ui.button(
                        name='submit_url',
                        label='Fetch Transcript',
                        primary=True
                    ),
                    ui.button(
                        name='clear_chat',
                        label='Clear Chat',
                        icon='Delete'
                    )
                ])
            ]
        )

        # Status card
        q.page['status'] = ui.form_card(
            box='1 3 12 1',
            items=[
                ui.text('Please enter a YouTube URL to begin.')
            ]
        )

        # Left column - transcript display
        q.page['transcript'] = ui.form_card(
            box='1 4 6 4',
            title='Video Transcript',
            items=[
                ui.text('Transcript will appear here...')
            ]
        )

        # Plots start with dummy data; real rows are swapped in on submit.
        q.page['word_freq'] = ui.plot_card(
            box='1 8 3 4',
            title='Word Frequency Analysis',
            data=data('word count', rows=[('', 0)], pack=True),
            plot=ui.plot([ui.mark(type='interval', x='=word', y='=count')])
        )

        q.page['sentiment'] = ui.plot_card(
            box='4 8 3 4',
            title='Sentiment Flow',
            data=data('index sentiment', rows=[(0, 0)], pack=True),
            plot=ui.plot([ui.mark(type='line', x='=index', y='=sentiment')])
        )

        # Key topics
        q.page['topics'] = ui.markdown_card(
            box='7 8 6 4',
            title='Key Topics',
            content='Key topics discussed in the video with their frequency of mention',
        )

        # Right column - chat interface (disabled until a transcript loads)
        q.page['chat'] = ui.chatbot_card(
            box='7 4 6 4',
            data=data(fields='content from_user', t='list'),
            name='chatbot',
            events=['feedback'],
            placeholder='Type your question here...',
            disabled=True,
        )

        # Feedback / export card
        q.page['feedback'] = ui.form_card(
            box='1 12 12 1',
            items=[
                ui.inline([
                    ui.text_l('Response Feedback'),
                    ui.text(name='feedback_text', content='No feedback yet.'),
                    ui.button(name='export_chat', label='Export Chat', icon='Download')
                ])
            ]
        )

    # ---------- URL submission ----------
    if q.args.submit_url:
        url = q.args.video_url
        video_id = extract_video_id(url)

        if not video_id:
            q.page['status'].items = [
                ui.message_bar(
                    type='error',
                    text='Invalid YouTube URL. Please check and try again.'
                )
            ]
            return

        # Show progress while fetching/analyzing.
        q.page['status'].items = [
            ui.progress(label='Processing video transcript...', value=True)
        ]
        await q.page.save()

        # get_transcript returns an 'Error...' string on failure.
        transcript_list = await get_transcript(video_id)
        if isinstance(transcript_list, str) and transcript_list.startswith('Error'):
            q.page['status'].items = [
                ui.message_bar(type='error', text=transcript_list)
            ]
            return

        # Store the flattened transcript and run the analyzer.
        q.client.transcript = ' '.join([entry['text'] for entry in transcript_list])
        analyzer = TranscriptAnalyzer(transcript_list)
        analysis = analyzer.analyze()

        # Render the transcript as timestamped segments with sentiment bars.
        transcript_items = [ui.text_xl('Video Transcript'), ui.separator()]

        for segment in analysis['time_segments']:
            # Timestamp header (markdown bold)
            transcript_items.append(
                ui.text(
                    f"**[{segment['start_time']} - {segment['end_time']}]**",
                    size='s'
                )
            )

            # Segment text
            transcript_items.append(ui.text(segment['text']))

            # Sentiment indicator: map compound score from [-1, 1] to [0, 1]
            sentiment_value = (segment['sentiment'] + 1) / 2
            transcript_items.append(
                ui.progress(
                    label='Sentiment',
                    value=sentiment_value,
                    caption=f"{'Positive' if segment['sentiment'] > 0.1 else 'Negative' if segment['sentiment'] < -0.1 else 'Neutral'}"
                )
            )

            # Segment topics, if any
            if segment['topics']:
                transcript_items.append(
                    ui.text(
                        f"Topics: {', '.join(segment['topics'])}",
                        size='s'
                    )
                )

            transcript_items.append(ui.separator())

        q.page['transcript'].items = transcript_items

        # Word-frequency bar chart
        word_freq_data = [(word, count) for word, count in analysis['word_freq']]
        q.page['word_freq'].data = data('word count', rows=word_freq_data, pack=True)

        # Sentiment line chart
        sentiment_data = [(i, score) for i, score in enumerate(analysis['sentiments'])]
        q.page['sentiment'].data = data('index sentiment', rows=sentiment_data, pack=True)

        # Key-topics markdown
        topics_md = "## Key Topics Discussed\n" + "\n".join([
            f"- {topic} ({count} mentions)\n Related: {', '.join(related)}"
            for topic, count, related in analysis['topics']
        ])
        q.page['topics'].content = topics_md

        # Build the h2oGPTe collection used by the chat handler.
        collection_id = await setup_h2ogpt_collection(q.client.transcript, video_id)
        if isinstance(collection_id, str) and collection_id.startswith('Error'):
            q.page['status'].items = [
                ui.message_bar(type='error', text=collection_id)
            ]
            return

        q.client.collection_id = collection_id

        # Enable chat and report success.
        q.page['chat'].disabled = False
        q.page['status'].items = [
            ui.message_bar(
                type='success',
                text='Transcript processed successfully! You can now ask questions about the video.'
            )
        ]

    # ---------- Clear chat ----------
    if q.args.clear_chat:
        q.page['chat'].data = data(fields='content from_user', t='list')
        q.page['feedback'].items[0][1].content = 'Chat history cleared.'

    # ---------- Export chat ----------
    if q.args.export_chat:
        # q.client is a Wave Expando: missing attributes read as None, so
        # hasattr() is always True — test the value instead.
        if q.client.transcript:
            chat_history = []
            for msg in q.page['chat'].data:
                prefix = "User: " if msg[1] else "Response: "
                chat_history.append(f'{prefix}{msg[0]}')

            # Join OUTSIDE the f-string: a backslash inside an f-string
            # expression is a SyntaxError before Python 3.12, and the
            # Dockerfile deploys this app on Python 3.9.
            history_text = "\n".join(chat_history)
            export_content = f'''YouTube Video Transcript Chatbot
Transcript:
{q.client.transcript}

Chat History:
{history_text}'''

            q.page['export'] = ui.form_card(
                box='1 13 12 2',
                items=[
                    ui.text_area(
                        name='export_content',
                        label='Chat Export (Copy and save)',
                        value=export_content,
                        height='200px'
                    )
                ]
            )

    # ---------- Chat messages ----------
    if q.args.chatbot:
        # Echo the user's message with a "User:" prefix.
        user_message = f"User: {q.args.chatbot}"
        q.page['chat'].data += [user_message, True]
        await q.page.save()

        # Value test, not hasattr(): see the Expando note above.
        if q.client.collection_id:
            response = await get_gpt_response(q.client.collection_id, q.args.chatbot)
            formatted_response = f"Response: {response}"
            q.page['chat'].data += [formatted_response, False]
        else:
            q.page['chat'].data += ['Response: Please fetch a video transcript first.', False]

    # ---------- Feedback events ----------
    if q.events.chatbot and q.events.chatbot.feedback:
        feedback = q.events.chatbot.feedback
        q.page['feedback'].items[0][1].content = f'Latest feedback: {feedback.type} on "{feedback.message}"'

    await q.page.save()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
# Runtime dependencies for the Wave YouTube-transcript chatbot app (app.py).
h2o-wave
youtube-transcript-api
python-dotenv
h2ogpte
nltk
textblob