Ankur Mahanta committed on
Commit ·
bfc3b02
1
Parent(s): 80c7b6d
Initial Wave app deployment
Browse files- Dockerfile +13 -0
- app.py +546 -0
- requirements.txt +6 -0
Dockerfile
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Base image: full (non-slim) Python 3.9 so NLTK/TextBlob and their
# dependencies install without extra system packages.
FROM python:3.9

# All app files live under /code inside the container.
WORKDIR /code

COPY . .

RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Change the port number of our Wave app to 7860 (Hugging Face Spaces default)
ENV H2O_WAVE_LISTEN=":7860"
ENV H2O_WAVE_ADDRESS='http://127.0.0.1:7860'

# Start the Wave server running the `app` module; --no-reload disables the
# development auto-reloader for production use.
CMD ["wave", "run", "app", "--no-reload"]
|
app.py
ADDED
|
@@ -0,0 +1,546 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Status: chat interaction and key-topics extraction are both working.
|
| 2 |
+
from h2o_wave import main, app, Q, ui, data
|
| 3 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
| 4 |
+
from h2ogpte import H2OGPTE
|
| 5 |
+
import re
|
| 6 |
+
import os
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
from collections import Counter
|
| 9 |
+
import nltk
|
| 10 |
+
from nltk.tokenize import word_tokenize
|
| 11 |
+
from nltk.corpus import stopwords
|
| 12 |
+
from nltk.sentiment import SentimentIntensityAnalyzer
|
| 13 |
+
from textblob import TextBlob
|
| 14 |
+
from nltk.tokenize import sent_tokenize
|
| 15 |
+
import asyncio
|
| 16 |
+
import logging
|
| 17 |
+
|
| 18 |
+
# Module-wide logging: timestamped INFO-level messages for the whole app.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Per-module logger used throughout this file.
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
class TranscriptAnalyzer:
    """Compute analytics over a YouTube transcript.

    ``transcript_list`` is the structure returned by
    ``YouTubeTranscriptApi.get_transcript``: a list of dicts with
    ``text``, ``start`` and ``duration`` keys.
    """

    def __init__(self, transcript_list):
        # Join all entries into one string for whole-transcript analysis.
        self.transcript = ' '.join(entry['text'] for entry in transcript_list)
        self.transcript_list = transcript_list
        self.sia = SentimentIntensityAnalyzer()
        # English stop words plus common spoken-filler words.
        self.stop_words = set(stopwords.words('english'))
        self.additional_stops = {'um', 'uh', 'like', 'okay', 'right', 'well', 'so'}
        self.stop_words.update(self.additional_stops)

        self.sentences = sent_tokenize(self.transcript)
        self.words = word_tokenize(self.transcript.lower())

    def analyze(self):
        """Return a dict with 'word_freq', 'sentiments', 'topics' and
        'time_segments'.

        Never raises: on any failure the error is logged and safe
        placeholder values are returned so the UI can still render.
        """
        try:
            return {
                'word_freq': self._analyze_word_frequency(),
                'sentiments': self._analyze_sentiment(),
                'topics': self._extract_topics(),
                'time_segments': self._create_time_segments()
            }
        except Exception as e:
            logger.error(f"Error in transcript analysis: {e}")
            # Placeholder payload matching the success shape.
            return {
                'word_freq': [('no data', 1)],
                'sentiments': [0.0],
                'topics': [('no topics', 1, ['none'])],
                'time_segments': [{
                    'start_time': '0:00',
                    'end_time': '0:00',
                    'text': 'Analysis not available',
                    'sentiment': 0.0,
                    'topics': []
                }]
            }

    def _analyze_word_frequency(self):
        """Top 15 meaningful words: punctuation-stripped, alphanumeric,
        non-numeric, longer than 2 chars, and not stop words."""
        cleaned = (word.strip('.,!?()[]{}":;') for word in self.words)
        meaningful = [
            word for word in cleaned
            if word.isalnum()
            and not word.isnumeric()
            and len(word) > 2
            and word not in self.stop_words
        ]
        return Counter(meaningful).most_common(15)

    def _analyze_sentiment(self):
        """VADER compound score (-1..1) for each sentence."""
        return [self.sia.polarity_scores(sentence)['compound'] for sentence in self.sentences]

    def _extract_topics(self):
        """Top 10 multi-word noun phrases as (topic, count, related) tuples.

        'related' is a static placeholder — the UI renders it, but no
        relation extraction is implemented yet.
        """
        blob = TextBlob(self.transcript)
        noun_phrases = [phrase for phrase in blob.noun_phrases if len(phrase.split()) >= 2]
        topic_counter = Counter(noun_phrases)
        return [(topic, count, ['related'])
                for topic, count in topic_counter.most_common(10)]

    def _create_time_segments(self):
        """Create time-based segments using actual YouTube timestamps."""
        segments = []
        segment_size = 5  # Number of transcript entries per segment

        for i in range(0, len(self.transcript_list), segment_size):
            segment_entries = self.transcript_list[i:i + segment_size]
            segment_text = ' '.join(entry['text'] for entry in segment_entries)

            # Segment spans the first entry's start through the last
            # entry's start + duration.
            start_time = segment_entries[0]['start']
            end_time = segment_entries[-1]['start'] + segment_entries[-1]['duration']

            # Convert times to minutes:seconds format.
            start_min, start_sec = divmod(int(start_time), 60)
            end_min, end_sec = divmod(int(end_time), 60)

            sentiment_scores = self.sia.polarity_scores(segment_text)
            blob = TextBlob(segment_text)

            segments.append({
                'start_time': f"{start_min}:{start_sec:02d}",
                'end_time': f"{end_min}:{end_sec:02d}",
                'text': segment_text,
                'sentiment': sentiment_scores['compound'],
                # list(...)[:3] instead of a redundant copying comprehension.
                'topics': list(blob.noun_phrases)[:3]
            })

        return segments
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# Download NLTK data safely (idempotent; quiet=True suppresses progress noise).
nltk_dependencies = ['punkt', 'stopwords', 'vader_lexicon']
for dep in nltk_dependencies:
    try:
        nltk.download(dep, quiet=True)
    except Exception as e:
        # Log-and-continue via the module logger (was `print`): a missing
        # corpus degrades analysis but should not prevent app startup.
        logger.error(f"Error downloading {dep}: {e}")
|
| 122 |
+
|
| 123 |
+
# Load environment variables
load_dotenv()

# Initialize H2O GPT client
# H2OGPT_URL / H2OGPT_API_KEY must be provided via the environment or a .env
# file. NOTE(review): if either is unset, os.getenv returns None and the
# client will fail at construction or first call — confirm startup behavior.
h2ogpt_url = os.getenv('H2OGPT_URL')
h2ogpt_api_key = os.getenv('H2OGPT_API_KEY')

client = H2OGPTE(
    address=h2ogpt_url,
    api_key=h2ogpt_api_key
)
|
| 134 |
+
|
| 135 |
+
def analyze_transcript(transcript):
    """Analyze a raw transcript string for word frequency, sentiment and topics.

    Returns a dict with keys:
      - 'word_freq': list of (word, count) pairs (top 10 meaningful words)
      - 'sentiments': list of VADER compound scores, at most 50 points
      - 'topics': list of (phrase, count) pairs (top 5 multi-word phrases)

    Never raises: on failure the error is logged and placeholder values
    are returned so the UI can still render.
    """
    try:
        # --- Word frequency ---
        tokens = word_tokenize(transcript.lower())
        stop_words = set(stopwords.words('english'))
        # Add common transcript-specific words to stop words.
        additional_stops = {'um', 'uh', 'like', 'okay', 'right', 'well', 'so', 'and', 'the', 'to', 'of', 'in', 'a', 'is', 'that'}
        stop_words.update(additional_stops)

        # Keep only meaningful words: alphanumeric, not pure numbers,
        # longer than 2 characters, not stop words.
        words = [word for word in tokens if (
            word.isalnum() and
            not word.isnumeric() and
            len(word) > 2 and
            word not in stop_words
        )]
        word_freq = Counter(words).most_common(10)

        # --- Sentiment ---
        # Split on sentence punctuation; fragments of <= 20 chars are noise.
        sentences = [s.strip() for s in re.split('[.!?]', transcript) if len(s.strip()) > 20]
        sia = SentimentIntensityAnalyzer()
        # Compound score gives one overall polarity per sentence.
        sentiments = [sia.polarity_scores(sentence)['compound'] for sentence in sentences]

        # Downsample long transcripts to at most 50 representative points.
        if len(sentiments) > 50:
            step = len(sentiments) // 50
            sentiments = sentiments[::step][:50]

        # --- Topics ---
        blob = TextBlob(transcript)
        noun_phrases = [phrase for phrase in blob.noun_phrases if len(phrase.split()) >= 2]  # only multi-word phrases
        topics = Counter(noun_phrases).most_common(5)

        # Guarantee non-empty results for the UI.
        if not word_freq:
            word_freq = [('no', 1)]
        if not sentiments:
            sentiments = [0.0]
        if not topics:
            topics = [('no topics found', 1)]

        return {
            'word_freq': word_freq,
            'sentiments': sentiments,
            'topics': topics
        }
    except Exception as e:
        # Use the module logger (was `print`) for consistency with the
        # rest of the app.
        logger.error(f"Error in transcript analysis: {e}")
        return {
            'word_freq': [('error', 1)],
            'sentiments': [0.0],
            'topics': [('error', 1)]
        }
|
| 198 |
+
|
| 199 |
+
def create_word_frequency_plot(word_freq):
    """Build the (data, plot) pair for a word-frequency bar chart.

    ``word_freq`` is a list of (word, count) pairs.
    """
    rows = [{'word': w, 'count': c} for w, c in word_freq]
    freq_data = data(fields=['word', 'count'], rows=rows)
    freq_plot = ui.plot([
        ui.mark(type='interval', x='=word', y='=count')
    ])
    return freq_data, freq_plot
|
| 213 |
+
|
| 214 |
+
def create_sentiment_plot(sentiments):
    """Build the (data, plot) pair for a sentiment line chart.

    ``sentiments`` is a sequence of per-sentence compound scores; the
    x axis is the sentence index (as a string).
    """
    rows = [{'index': str(pos), 'sentiment': score}
            for pos, score in enumerate(sentiments)]
    trend_data = data(fields=['index', 'sentiment'], rows=rows)
    trend_plot = ui.plot([
        ui.mark(type='line', x='=index', y='=sentiment')
    ])
    return trend_data, trend_plot
|
| 228 |
+
|
| 229 |
+
def extract_video_id(url):
    """Extract the 11-character YouTube video ID from *url*.

    Handles ``watch?v=...``, ``youtu.be/...`` and path-style (``/embed/``)
    forms. Returns None when no ID is found. The terminator class now
    includes '#' so fragment URLs like ``watch?v=ID#t=30`` are accepted
    (the previous pattern rejected them); all previously-matched URLs
    still match.
    """
    pattern = r'(?:v=|\/)([\w-]{11})(?:[?\/&#]|$)'
    match = re.search(pattern, url)
    return match.group(1) if match else None
|
| 234 |
+
|
| 235 |
+
async def get_transcript(video_id: str):
    """Fetch the transcript for *video_id* without blocking the event loop.

    Runs the blocking YouTubeTranscriptApi call in the default executor.
    Returns the transcript list on success, or an 'Error: ...' string on
    failure (callers check for the 'Error' prefix).
    """
    try:
        # get_running_loop() is the supported way to obtain the loop from
        # inside a coroutine; get_event_loop() is deprecated here.
        loop = asyncio.get_running_loop()
        transcript_list = await loop.run_in_executor(
            None, YouTubeTranscriptApi.get_transcript, video_id
        )
        return transcript_list

    except Exception as e:
        # Use the module logger (was `print`) for consistency.
        logger.error(f"Error fetching transcript: {e}")
        return f"Error: {str(e)}"
|
| 246 |
+
|
| 247 |
+
async def setup_h2ogpt_collection(transcript, video_id):
    """Create an H2OGPTE collection containing *transcript*.

    Writes the transcript to a scratch file, uploads and ingests it into
    a new collection, then deletes the file. Returns the collection id,
    or an 'Error...' string on failure (callers check the prefix).

    NOTE(review): the client calls are blocking even though this
    coroutine is async — consider an executor if ingestion is slow.
    """
    transcript_path = f'transcript_{video_id}.txt'
    try:
        collection_id = client.create_collection(
            name=f'YouTube_Video_{video_id}',
            description='YouTube video transcript for chat interaction'
        )

        with open(transcript_path, 'w', encoding='utf-8') as f:
            f.write(transcript)

        try:
            with open(transcript_path, 'rb') as f:
                upload_id = client.upload(transcript_path, f)
            client.ingest_uploads(collection_id, [upload_id])
        finally:
            # Previously the scratch file leaked if upload/ingest raised;
            # always clean it up.
            if os.path.exists(transcript_path):
                os.remove(transcript_path)

        return collection_id
    except Exception as e:
        return f"Error setting up H2O GPT: {str(e)}"
|
| 267 |
+
|
| 268 |
+
async def get_gpt_response(collection_id, question):
    """Ask H2O GPT *question* over the given collection and return the
    answer text.

    Failures are reported as an 'Error...' string instead of raising,
    matching the convention used throughout this module.
    """
    try:
        session_id = client.create_chat_session(collection_id)
        with client.connect(session_id) as chat:
            reply = chat.query(
                question,
                timeout=60,
                rag_config={"rag_type": "rag"},
            )
            return reply.content
    except Exception as e:
        return f"Error getting response: {str(e)}"
|
| 281 |
+
|
| 282 |
+
@app('/chatbot')
async def serve(q: Q):
    """Main Wave request handler for the /chatbot page.

    First visit builds the UI; subsequent requests dispatch on q.args:
    submit_url (fetch + analyze a transcript), clear_chat, export_chat,
    chatbot (RAG question), and the chatbot 'feedback' event.

    BUG FIX: the chat-export f-string previously contained
    ``{"\\n".join(...)}`` — a backslash inside an f-string expression,
    which is a SyntaxError before Python 3.12 (the Dockerfile pins
    python:3.9, so the module could not even import). The join is now
    computed before the f-string.
    """
    if not q.client.initialized:
        q.client.initialized = True

        # Header
        q.page['header'] = ui.header_card(
            box='1 1 12 1',
            title='YouTube Video Transcript Chatbot & Analysis | xAmplify',
            subtitle='Enter a YouTube URL to analyse and chat about the video content',
            color='primary'
        )

        # URL input form
        q.page['url_form'] = ui.form_card(
            box='1 2 12 1',
            items=[
                ui.inline([
                    ui.textbox(
                        name='video_url',
                        placeholder='Enter YouTube video URL...',
                        width='800px'
                    ),
                    ui.button(
                        name='submit_url',
                        label='Fetch Transcript',
                        primary=True
                    ),
                    ui.button(
                        name='clear_chat',
                        label='Clear Chat',
                        icon='Delete'
                    )
                ])
            ]
        )

        # Status card
        q.page['status'] = ui.form_card(
            box='1 3 12 1',
            items=[
                ui.text('Please enter a YouTube URL to begin.')
            ]
        )

        # Left column - Transcript and Analysis
        q.page['transcript'] = ui.form_card(
            box='1 4 6 4',
            title='Video Transcript',
            items=[
                ui.text('Transcript will appear here...')
            ]
        )

        # Initialize plots with dummy data
        q.page['word_freq'] = ui.plot_card(
            box='1 8 3 4',
            title='Word Frequency Analysis',
            data=data('word count', rows=[('', 0)], pack=True),
            plot=ui.plot([ui.mark(type='interval', x='=word', y='=count')])
        )

        q.page['sentiment'] = ui.plot_card(
            box='4 8 3 4',
            title='Sentiment Flow',
            data=data('index sentiment', rows=[(0, 0)], pack=True),
            plot=ui.plot([ui.mark(type='line', x='=index', y='=sentiment')])
        )

        # Key topics
        q.page['topics'] = ui.markdown_card(
            box='7 8 6 4',
            title='Key Topics',
            content='Key topics discussed in the video with their frequency of mention',
        )

        # Right column - Chat interface (disabled until a transcript loads)
        q.page['chat'] = ui.chatbot_card(
            box='7 4 6 4',
            data=data(fields='content from_user', t='list'),
            name='chatbot',
            events=['feedback'],
            placeholder='Type your question here...',
            disabled=True,
        )

        # Feedback card
        q.page['feedback'] = ui.form_card(
            box='1 12 12 1',
            items=[
                ui.inline([
                    ui.text_l('Response Feedback'),
                    ui.text(name='feedback_text', content='No feedback yet.'),
                    ui.button(name='export_chat', label='Export Chat', icon='Download')
                ])
            ]
        )

    # Handle URL submission
    if q.args.submit_url:
        url = q.args.video_url
        video_id = extract_video_id(url)

        if not video_id:
            q.page['status'].items = [
                ui.message_bar(
                    type='error',
                    text='Invalid YouTube URL. Please check and try again.'
                )
            ]
            return

        # Update status to processing
        q.page['status'].items = [
            ui.progress(label='Processing video transcript...', value=True)
        ]
        await q.page.save()

        # Get and process transcript; errors come back as 'Error...' strings.
        transcript_list = await get_transcript(video_id)
        if isinstance(transcript_list, str) and transcript_list.startswith('Error'):
            q.page['status'].items = [
                ui.message_bar(type='error', text=transcript_list)
            ]
            return

        # Store transcript and analyze
        q.client.transcript = ' '.join([entry['text'] for entry in transcript_list])
        analyzer = TranscriptAnalyzer(transcript_list)
        analysis = analyzer.analyze()

        # Update transcript display with time segments
        transcript_items = []
        transcript_items.append(ui.text_xl('Video Transcript'))
        transcript_items.append(ui.separator())

        for segment in analysis['time_segments']:
            # Timestamp header (markdown bold)
            transcript_items.append(
                ui.text(
                    f"**[{segment['start_time']} - {segment['end_time']}]**",
                    size='s'
                )
            )

            # Segment text
            transcript_items.append(ui.text(segment['text']))

            # Sentiment indicator: progress bar expects [0,1], VADER gives [-1,1].
            sentiment_value = (segment['sentiment'] + 1) / 2
            transcript_items.append(
                ui.progress(
                    label='Sentiment',
                    value=sentiment_value,
                    caption=f"{'Positive' if segment['sentiment'] > 0.1 else 'Negative' if segment['sentiment'] < -0.1 else 'Neutral'}"
                )
            )

            # Segment topics if available
            if segment['topics']:
                transcript_items.append(
                    ui.text(
                        f"Topics: {', '.join(segment['topics'])}",
                        size='s'
                    )
                )

            transcript_items.append(ui.separator())

        q.page['transcript'].items = transcript_items

        # Update analysis visualizations
        word_freq_data = [(word, count) for word, count in analysis['word_freq']]
        q.page['word_freq'].data = data('word count', rows=word_freq_data, pack=True)

        sentiment_data = [(i, score) for i, score in enumerate(analysis['sentiments'])]
        q.page['sentiment'].data = data('index sentiment', rows=sentiment_data, pack=True)

        # Update topics markdown
        topics_md = "## Key Topics Discussed\n" + "\n".join([
            f"- {topic} ({count} mentions)\n  Related: {', '.join(related)}"
            for topic, count, related in analysis['topics']
        ])
        q.page['topics'].content = topics_md

        # Setup H2O GPT collection
        collection_id = await setup_h2ogpt_collection(q.client.transcript, video_id)
        if isinstance(collection_id, str) and collection_id.startswith('Error'):
            q.page['status'].items = [
                ui.message_bar(type='error', text=collection_id)
            ]
            return

        q.client.collection_id = collection_id

        # Enable chat and update status
        q.page['chat'].disabled = False
        q.page['status'].items = [
            ui.message_bar(
                type='success',
                text='Transcript processed successfully! You can now ask questions about the video.'
            )
        ]

    # Handle chat clear
    if q.args.clear_chat:
        q.page['chat'].data = data(fields='content from_user', t='list')
        # NOTE(review): indexing into the inline's children like this is
        # fragile — verify items[0][1] really is the feedback_text component.
        q.page['feedback'].items[0][1].content = 'Chat history cleared.'

    # Handle chat export
    if q.args.export_chat:
        if hasattr(q.client, 'transcript'):
            chat_history = []
            # NOTE(review): iterating the card's data buffer assumes rows
            # come back as (content, from_user) pairs — confirm against
            # Wave's data API.
            for msg in q.page['chat'].data:
                prefix = "User: " if msg[1] else "Response: "
                chat_history.append(f'{prefix}{msg[0]}')

            # Join BEFORE the f-string: a backslash inside an f-string
            # expression is a SyntaxError on Python < 3.12.
            history_text = "\n".join(chat_history)
            export_content = f'''YouTube Video Transcript Chatbot
Transcript:
{q.client.transcript}

Chat History:
{history_text}'''

            q.page['export'] = ui.form_card(
                box='1 13 12 2',
                items=[
                    ui.text_area(
                        name='export_content',
                        label='Chat Export (Copy and save)',
                        value=export_content,
                        height='200px'
                    )
                ]
            )

    # Handle chat messages
    if q.args.chatbot:
        # Echo the user's message immediately with a "User:" prefix.
        user_message = f"User: {q.args.chatbot}"
        q.page['chat'].data += [user_message, True]
        await q.page.save()

        if hasattr(q.client, 'collection_id'):
            response = await get_gpt_response(q.client.collection_id, q.args.chatbot)
            formatted_response = f"Response: {response}"
            q.page['chat'].data += [formatted_response, False]
        else:
            q.page['chat'].data += ['Response: Please fetch a video transcript first.', False]

    # Handle feedback events from the chatbot card
    if q.events.chatbot and q.events.chatbot.feedback:
        feedback = q.events.chatbot.feedback
        q.page['feedback'].items[0][1].content = f'Latest feedback: {feedback.type} on "{feedback.message}"'

    await q.page.save()
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Web app framework (H2O Wave server + SDK)
h2o-wave
# Fetches YouTube caption/transcript data
youtube-transcript-api
# Loads H2OGPT_URL / H2OGPT_API_KEY from a .env file
python-dotenv
# H2O Enterprise GPT client (RAG chat over the transcript)
h2ogpte
# Tokenization, stop words, VADER sentiment
nltk
# Noun-phrase extraction for key topics
textblob
|