Upload 29 files
Browse files- src/config/__init__.py +1 -0
- src/config/__pycache__/__init__.cpython-310.pyc +0 -0
- src/config/__pycache__/settings.cpython-310.pyc +0 -0
- src/config/config.yaml +47 -0
- src/config/huggingface.yaml +16 -0
- src/config/settings.py +183 -0
- src/logs/app.log +145 -0
- src/logs/youtube_chatbot_20250714.log +12 -0
- src/src/__init__.py +3 -0
- src/src/__pycache__/__init__.cpython-310.pyc +0 -0
- src/src/utils/__init__.py +1 -0
- src/src/utils/__pycache__/__init__.cpython-310.pyc +0 -0
- src/src/utils/__pycache__/export_utils.cpython-310.pyc +0 -0
- src/src/utils/__pycache__/logger.cpython-310.pyc +0 -0
- src/src/utils/__pycache__/session_manager.cpython-310.pyc +0 -0
- src/src/utils/__pycache__/text_processor.cpython-310.pyc +0 -0
- src/src/utils/__pycache__/youtube_handler.cpython-310.pyc +0 -0
- src/src/utils/cache_manager.py +374 -0
- src/src/utils/database.py +373 -0
- src/src/utils/export_utils.py +262 -0
- src/src/utils/logger.py +46 -0
- src/src/utils/session_manager.py +182 -0
- src/src/utils/text_processor.py +377 -0
- src/src/utils/youtube_handler.py +369 -0
- src/static/style.css +501 -0
- src/tests/__init__.py +1 -0
- src/tests/test_session_manager.py +272 -0
- src/tests/test_text_processor.py +217 -0
- src/tests/test_youtube_handler.py +115 -0
src/config/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Configuration package
|
src/config/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (157 Bytes). View file
|
|
|
src/config/__pycache__/settings.cpython-310.pyc
ADDED
|
Binary file (5.8 kB). View file
|
|
|
src/config/config.yaml
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# YouTube Transcript Chatbot Configuration
|
| 2 |
+
|
| 3 |
+
app:
|
| 4 |
+
title: "AI-Powered YouTube Transcript Tutor"
|
| 5 |
+
description: "Ask questions from YouTube lecture transcripts using AI"
|
| 6 |
+
version: "1.0.0"
|
| 7 |
+
|
| 8 |
+
ui:
|
| 9 |
+
theme: "light" # light, dark, auto
|
| 10 |
+
sidebar_width: 300
|
| 11 |
+
max_chat_history_display: 50
|
| 12 |
+
enable_animations: true
|
| 13 |
+
|
| 14 |
+
processing:
|
| 15 |
+
default_chunk_size: 1000
|
| 16 |
+
chunk_overlap: 200
|
| 17 |
+
max_transcript_length: 1000000 # 1MB
|
| 18 |
+
supported_languages: ["en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "zh"]
|
| 19 |
+
default_language: "en"
|
| 20 |
+
|
| 21 |
+
ai:
|
| 22 |
+
model_temperature: 0.7
|
| 23 |
+
max_tokens: 2000
|
| 24 |
+
retrieval_k: 4 # Number of documents to retrieve
|
| 25 |
+
chain_type: "stuff" # stuff, map_reduce, refine, map_rerank
|
| 26 |
+
|
| 27 |
+
export:
|
| 28 |
+
formats: ["pdf", "txt", "json"]
|
| 29 |
+
max_export_entries: 1000
|
| 30 |
+
pdf_page_size: "A4"
|
| 31 |
+
|
| 32 |
+
cache:
|
| 33 |
+
enable_vectorstore_cache: true
|
| 34 |
+
cache_directory: "cache"
|
| 35 |
+
max_cache_size_mb: 500
|
| 36 |
+
|
| 37 |
+
logging:
|
| 38 |
+
level: "INFO" # DEBUG, INFO, WARNING, ERROR, CRITICAL
|
| 39 |
+
file: "logs/app.log"
|
| 40 |
+
max_file_size_mb: 10
|
| 41 |
+
backup_count: 5
|
| 42 |
+
|
| 43 |
+
security:
|
| 44 |
+
max_url_length: 2048
|
| 45 |
+
allowed_domains: ["youtube.com", "youtu.be", "m.youtube.com"]
|
| 46 |
+
rate_limit_requests: 100
|
| 47 |
+
rate_limit_window_minutes: 60
|
src/config/huggingface.yaml
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Hugging Face Spaces specific configuration
|
| 2 |
+
|
| 3 |
+
app:
|
| 4 |
+
title: "AI-Powered YouTube Transcript Tutor"
|
| 5 |
+
description: "Ask questions from YouTube lecture transcripts using AI"
|
| 6 |
+
|
| 7 |
+
logging:
|
| 8 |
+
level: "INFO"
|
| 9 |
+
file: null # Disable file logging
|
| 10 |
+
|
| 11 |
+
cache:
|
| 12 |
+
enable_vectorstore_cache: false
|
| 13 |
+
cache_directory: null
|
| 14 |
+
|
| 15 |
+
security:
|
| 16 |
+
youtube_api_fallback: true # Enable fallback methods for YouTube API
|
src/config/settings.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration settings management.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import yaml
|
| 7 |
+
from typing import Dict, Any, Optional
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
class Settings:
    """Application settings manager.

    Loads configuration from a YAML file, falls back to built-in defaults
    when the file is missing or unreadable, and lets selected environment
    variables (OPENAI_API_KEY, LOG_LEVEL, CACHE_DIRECTORY) override file
    values. Values are accessed with dot notation, e.g. ``get('app.title')``.
    """

    def __init__(self, config_file: str = "config/config.yaml"):
        """
        Initialize settings from config file and environment variables.

        Args:
            config_file (str): Path to configuration file
        """
        self.config_file = config_file
        self.config = self._load_config()
        self._override_with_env()

    def _load_config(self) -> Dict[str, Any]:
        """Load configuration from the YAML file, or defaults if unavailable.

        Returns:
            Dict[str, Any]: Parsed configuration mapping. Falls back to
            ``_get_default_config()`` when the file does not exist or any
            error occurs while reading/parsing it.
        """
        try:
            # Imported lazily so the module stays importable (using the
            # built-in defaults) even in environments without PyYAML.
            import yaml

            config_path = Path(self.config_file)
            if config_path.exists():
                with open(config_path, 'r', encoding='utf-8') as f:
                    # safe_load avoids executing arbitrary YAML tags on
                    # untrusted input; an empty file parses to None, hence
                    # the `or {}` guard.
                    return yaml.safe_load(f) or {}
            return self._get_default_config()
        except Exception as e:
            # Best-effort: report and continue with defaults rather than
            # crashing the application at startup.
            print(f"Error loading config file: {e}")
            return self._get_default_config()

    def _get_default_config(self) -> Dict[str, Any]:
        """Get default configuration (mirrors config/config.yaml)."""
        return {
            'app': {
                'title': 'AI-Powered YouTube Transcript Tutor',
                'description': 'Ask questions from YouTube lecture transcripts using AI',
                'version': '1.0.0'
            },
            'ui': {
                'theme': 'light',
                'sidebar_width': 300,
                'max_chat_history_display': 50,
                'enable_animations': True
            },
            'processing': {
                'default_chunk_size': 1000,
                'chunk_overlap': 200,
                'max_transcript_length': 1000000,
                'supported_languages': ['en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh'],
                'default_language': 'en'
            },
            'ai': {
                'model_temperature': 0.7,
                'max_tokens': 2000,
                'retrieval_k': 4,
                'chain_type': 'stuff'
            },
            'export': {
                'formats': ['pdf', 'txt', 'json'],
                'max_export_entries': 1000,
                'pdf_page_size': 'A4'
            },
            'cache': {
                'enable_vectorstore_cache': True,
                'cache_directory': 'cache',
                'max_cache_size_mb': 500
            },
            'logging': {
                'level': 'INFO',
                'file': 'logs/app.log',
                'max_file_size_mb': 10,
                'backup_count': 5
            },
            'security': {
                'max_url_length': 2048,
                'allowed_domains': ['youtube.com', 'youtu.be', 'm.youtube.com'],
                'rate_limit_requests': 100,
                'rate_limit_window_minutes': 60
            }
        }

    def _override_with_env(self):
        """Override configuration with environment variables.

        Recognized variables: OPENAI_API_KEY, LOG_LEVEL, CACHE_DIRECTORY.
        """
        # setdefault guards against a YAML file that omits a section;
        # the previous code indexed self.config['logging'] /
        # self.config['cache'] directly and raised KeyError in that case.
        openai_key = os.getenv('OPENAI_API_KEY')
        if openai_key:
            self.config.setdefault('ai', {})['openai_api_key'] = openai_key

        log_level = os.getenv('LOG_LEVEL')
        if log_level:
            self.config.setdefault('logging', {})['level'] = log_level.upper()

        cache_dir = os.getenv('CACHE_DIRECTORY')
        if cache_dir:
            self.config.setdefault('cache', {})['cache_directory'] = cache_dir

    def get(self, key: str, default: Any = None) -> Any:
        """
        Get configuration value using dot notation.

        Args:
            key (str): Configuration key (e.g., 'app.title')
            default (Any): Default value if key not found

        Returns:
            Any: Configuration value, or ``default`` when any segment of
            the dotted path is missing or not a mapping.
        """
        value = self.config
        try:
            for part in key.split('.'):
                value = value[part]
            return value
        except (KeyError, TypeError):
            # TypeError covers traversing through a non-dict leaf value.
            return default

    def set(self, key: str, value: Any):
        """
        Set configuration value using dot notation.

        Intermediate dictionaries are created as needed.

        Args:
            key (str): Configuration key (e.g., 'app.title')
            value (Any): Value to set
        """
        parts = key.split('.')
        node = self.config
        for part in parts[:-1]:
            if part not in node:
                node[part] = {}
            node = node[part]
        node[parts[-1]] = value

    def get_openai_api_key(self) -> Optional[str]:
        """Get OpenAI API key from config or environment."""
        return self.get('ai.openai_api_key') or os.getenv('OPENAI_API_KEY')

    def get_app_config(self) -> Dict[str, Any]:
        """Get application configuration."""
        return self.get('app', {})

    def get_ui_config(self) -> Dict[str, Any]:
        """Get UI configuration."""
        return self.get('ui', {})

    def get_processing_config(self) -> Dict[str, Any]:
        """Get processing configuration."""
        return self.get('processing', {})

    def get_ai_config(self) -> Dict[str, Any]:
        """Get AI configuration."""
        return self.get('ai', {})

    def get_export_config(self) -> Dict[str, Any]:
        """Get export configuration."""
        return self.get('export', {})

    def get_cache_config(self) -> Dict[str, Any]:
        """Get cache configuration."""
        return self.get('cache', {})

    def get_logging_config(self) -> Dict[str, Any]:
        """Get logging configuration."""
        return self.get('logging', {})

    def get_security_config(self) -> Dict[str, Any]:
        """Get security configuration."""
        return self.get('security', {})


# Global settings instance, shared by the rest of the application.
settings = Settings()
|
src/logs/app.log
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-07-13 22:55:30,859 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
|
| 2 |
+
2025-07-13 22:55:37,615 - src.utils.youtube_handler - ERROR - Unexpected error getting transcript: 'FetchedTranscriptSnippet' object is not subscriptable
|
| 3 |
+
2025-07-13 22:56:36,886 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
|
| 4 |
+
2025-07-13 22:56:40,977 - src.utils.youtube_handler - ERROR - Unexpected error getting transcript: 'FetchedTranscriptSnippet' object is not subscriptable
|
| 5 |
+
2025-07-13 22:56:54,360 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
|
| 6 |
+
2025-07-13 22:57:04,282 - src.utils.youtube_handler - ERROR - Unexpected error getting transcript: 'FetchedTranscriptSnippet' object is not subscriptable
|
| 7 |
+
2025-07-13 22:58:12,592 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
|
| 8 |
+
2025-07-13 22:58:21,552 - src.utils.youtube_handler - ERROR - Unexpected error getting transcript: 'FetchedTranscriptSnippet' object is not subscriptable
|
| 9 |
+
2025-07-13 22:58:38,183 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
|
| 10 |
+
2025-07-13 22:58:41,834 - src.utils.youtube_handler - ERROR - Unexpected error getting transcript: 'FetchedTranscriptSnippet' object is not subscriptable
|
| 11 |
+
2025-07-13 22:59:16,207 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
|
| 12 |
+
2025-07-13 22:59:22,975 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
|
| 13 |
+
2025-07-13 22:59:23,716 - src.utils.youtube_handler - ERROR - Unexpected error getting transcript: 'FetchedTranscriptSnippet' object is not subscriptable
|
| 14 |
+
2025-07-13 22:59:26,323 - src.utils.youtube_handler - ERROR - Unexpected error getting transcript: 'FetchedTranscriptSnippet' object is not subscriptable
|
| 15 |
+
2025-07-13 23:01:13,950 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
|
| 16 |
+
2025-07-13 23:01:35,772 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
|
| 17 |
+
2025-07-13 23:01:35,774 - openai._base_client - INFO - Retrying request to /embeddings in 0.378161 seconds
|
| 18 |
+
2025-07-13 23:01:37,503 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
|
| 19 |
+
2025-07-13 23:01:37,505 - openai._base_client - INFO - Retrying request to /embeddings in 0.796060 seconds
|
| 20 |
+
2025-07-13 23:01:39,284 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
|
| 21 |
+
2025-07-13 23:01:39,286 - src.utils.text_processor - ERROR - Error creating vector store: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
|
| 22 |
+
2025-07-13 23:02:22,588 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
|
| 23 |
+
2025-07-13 23:02:36,283 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
|
| 24 |
+
2025-07-13 23:02:36,285 - openai._base_client - INFO - Retrying request to /embeddings in 0.379324 seconds
|
| 25 |
+
2025-07-13 23:02:37,475 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
|
| 26 |
+
2025-07-13 23:02:37,476 - openai._base_client - INFO - Retrying request to /embeddings in 0.943958 seconds
|
| 27 |
+
2025-07-13 23:02:39,327 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
|
| 28 |
+
2025-07-13 23:02:39,328 - src.utils.text_processor - ERROR - Error creating vector store: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
|
| 29 |
+
2025-07-13 23:09:22,969 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
|
| 30 |
+
2025-07-13 23:09:26,985 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
|
| 31 |
+
2025-07-13 23:09:26,986 - openai._base_client - INFO - Retrying request to /embeddings in 0.395765 seconds
|
| 32 |
+
2025-07-13 23:09:27,911 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
|
| 33 |
+
2025-07-13 23:09:27,913 - openai._base_client - INFO - Retrying request to /embeddings in 0.940555 seconds
|
| 34 |
+
2025-07-13 23:09:29,552 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
|
| 35 |
+
2025-07-13 23:09:29,554 - src.utils.text_processor - WARNING - OpenAI embeddings failed: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
|
| 36 |
+
2025-07-13 23:09:29,554 - src.utils.text_processor - INFO - Using simple text-based fallback
|
| 37 |
+
2025-07-13 23:09:29,554 - src.utils.text_processor - INFO - Created simple text-based fallback vector store
|
| 38 |
+
2025-07-13 23:09:29,555 - src.utils.text_processor - INFO - Using simple fallback QA system
|
| 39 |
+
2025-07-13 23:15:01,397 - src.utils.logger - INFO - Custom CSS loaded successfully
|
| 40 |
+
2025-07-13 23:15:05,056 - src.utils.logger - INFO - Custom CSS loaded successfully
|
| 41 |
+
2025-07-13 23:15:15,923 - src.utils.logger - INFO - Custom CSS loaded successfully
|
| 42 |
+
2025-07-13 23:15:17,491 - src.utils.logger - INFO - Custom CSS loaded successfully
|
| 43 |
+
2025-07-13 23:15:19,654 - src.utils.logger - INFO - Custom CSS loaded successfully
|
| 44 |
+
2025-07-13 23:15:23,535 - src.utils.logger - INFO - Custom CSS loaded successfully
|
| 45 |
+
2025-07-13 23:16:06,979 - src.utils.logger - INFO - Custom CSS loaded successfully
|
| 46 |
+
2025-07-13 23:21:32,219 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
|
| 47 |
+
2025-07-13 23:21:43,012 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
|
| 48 |
+
2025-07-13 23:21:43,013 - openai._base_client - INFO - Retrying request to /embeddings in 0.396331 seconds
|
| 49 |
+
2025-07-13 23:21:44,678 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
|
| 50 |
+
2025-07-13 23:21:44,680 - openai._base_client - INFO - Retrying request to /embeddings in 0.842338 seconds
|
| 51 |
+
2025-07-13 23:21:47,127 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
|
| 52 |
+
2025-07-13 23:21:47,128 - src.utils.text_processor - WARNING - OpenAI embeddings failed: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
|
| 53 |
+
2025-07-13 23:21:47,129 - src.utils.text_processor - INFO - Using simple text-based fallback
|
| 54 |
+
2025-07-13 23:21:47,129 - src.utils.text_processor - INFO - Created simple text-based fallback vector store
|
| 55 |
+
2025-07-13 23:21:47,129 - src.utils.text_processor - INFO - Using simple fallback QA system
|
| 56 |
+
2025-07-13 23:22:46,498 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 57 |
+
2025-07-13 23:22:49,535 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 58 |
+
2025-07-13 23:53:47,078 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 59 |
+
2025-07-13 23:53:52,909 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 60 |
+
2025-07-13 23:53:59,444 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 61 |
+
2025-07-13 23:53:59,609 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 62 |
+
2025-07-13 23:54:00,519 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
|
| 63 |
+
2025-07-13 23:54:07,685 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
|
| 64 |
+
2025-07-13 23:54:07,688 - openai._base_client - INFO - Retrying request to /embeddings in 0.454709 seconds
|
| 65 |
+
2025-07-13 23:54:08,673 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
|
| 66 |
+
2025-07-13 23:54:08,674 - openai._base_client - INFO - Retrying request to /embeddings in 0.918276 seconds
|
| 67 |
+
2025-07-13 23:54:10,652 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
|
| 68 |
+
2025-07-13 23:54:10,656 - src.utils.text_processor - WARNING - OpenAI embeddings failed: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
|
| 69 |
+
2025-07-13 23:54:10,657 - src.utils.text_processor - INFO - Using simple text-based fallback
|
| 70 |
+
2025-07-13 23:54:10,659 - src.utils.text_processor - INFO - Created simple text-based fallback vector store
|
| 71 |
+
2025-07-13 23:54:10,660 - src.utils.text_processor - INFO - Using simple fallback QA system
|
| 72 |
+
2025-07-13 23:54:22,185 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 73 |
+
2025-07-13 23:54:24,094 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 74 |
+
2025-07-14 00:42:29,448 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 75 |
+
2025-07-14 00:42:47,515 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 76 |
+
2025-07-14 00:42:47,673 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 77 |
+
2025-07-14 00:42:48,260 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
|
| 78 |
+
2025-07-14 00:42:52,222 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
|
| 79 |
+
2025-07-14 00:42:52,224 - openai._base_client - INFO - Retrying request to /embeddings in 0.396998 seconds
|
| 80 |
+
2025-07-14 00:42:53,417 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
|
| 81 |
+
2025-07-14 00:42:53,419 - openai._base_client - INFO - Retrying request to /embeddings in 0.829603 seconds
|
| 82 |
+
2025-07-14 00:42:54,708 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
|
| 83 |
+
2025-07-14 00:42:54,713 - src.utils.text_processor - WARNING - OpenAI embeddings failed: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
|
| 84 |
+
2025-07-14 00:42:54,715 - src.utils.text_processor - INFO - Using simple text-based fallback
|
| 85 |
+
2025-07-14 00:42:54,717 - src.utils.text_processor - INFO - Created simple text-based fallback vector store
|
| 86 |
+
2025-07-14 00:42:54,718 - src.utils.text_processor - INFO - Using simple fallback QA system
|
| 87 |
+
2025-07-14 00:50:26,573 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 88 |
+
2025-07-14 01:00:15,758 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 89 |
+
2025-07-14 01:00:23,869 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 90 |
+
2025-07-14 01:00:24,021 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 91 |
+
2025-07-14 01:00:24,480 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
|
| 92 |
+
2025-07-14 01:00:26,491 - src.utils.youtube_handler - INFO - Successfully got transcript in en
|
| 93 |
+
2025-07-14 01:00:28,434 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
|
| 94 |
+
2025-07-14 01:00:28,436 - openai._base_client - INFO - Retrying request to /embeddings in 0.464677 seconds
|
| 95 |
+
2025-07-14 01:00:29,888 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
|
| 96 |
+
2025-07-14 01:00:29,889 - openai._base_client - INFO - Retrying request to /embeddings in 0.932156 seconds
|
| 97 |
+
2025-07-14 01:00:31,765 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
|
| 98 |
+
2025-07-14 01:00:31,768 - src.utils.text_processor - WARNING - OpenAI embeddings failed: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
|
| 99 |
+
2025-07-14 01:00:31,770 - src.utils.text_processor - INFO - Using simple text-based fallback
|
| 100 |
+
2025-07-14 01:00:31,771 - src.utils.text_processor - INFO - Created simple text-based fallback vector store
|
| 101 |
+
2025-07-14 01:00:31,772 - src.utils.text_processor - INFO - Using simple fallback QA system
|
| 102 |
+
2025-07-14 01:01:09,650 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 103 |
+
2025-07-14 01:01:32,106 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 104 |
+
2025-07-14 01:01:32,361 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 105 |
+
2025-07-14 01:01:33,814 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 106 |
+
2025-07-14 01:01:33,972 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 107 |
+
2025-07-14 01:01:34,156 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 108 |
+
2025-07-14 01:01:34,479 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 109 |
+
2025-07-14 01:01:34,750 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 110 |
+
2025-07-14 01:01:34,892 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 111 |
+
2025-07-14 01:01:35,077 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 112 |
+
2025-07-14 01:01:35,399 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 113 |
+
2025-07-14 01:01:38,181 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 114 |
+
2025-07-14 01:01:38,429 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 115 |
+
2025-07-14 01:01:39,552 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 116 |
+
2025-07-14 01:01:39,800 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 117 |
+
2025-07-14 01:01:39,822 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 118 |
+
2025-07-14 01:01:39,989 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 119 |
+
2025-07-14 01:01:40,370 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 120 |
+
2025-07-14 01:01:41,960 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 121 |
+
2025-07-14 01:14:18,892 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 122 |
+
2025-07-14 01:14:53,458 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 123 |
+
2025-07-14 01:14:53,729 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 124 |
+
2025-07-14 01:14:54,391 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 125 |
+
2025-07-14 01:14:54,504 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 126 |
+
2025-07-14 01:14:54,695 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 127 |
+
2025-07-14 01:14:54,820 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 128 |
+
2025-07-14 01:14:55,091 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 129 |
+
2025-07-14 01:14:55,861 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 130 |
+
2025-07-14 01:14:55,967 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 131 |
+
2025-07-14 01:14:56,154 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 132 |
+
2025-07-14 01:14:56,488 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 133 |
+
2025-07-14 01:14:56,756 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 134 |
+
2025-07-14 01:14:56,902 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 135 |
+
2025-07-14 01:14:57,250 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 136 |
+
2025-07-14 01:14:58,213 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 137 |
+
2025-07-14 01:14:58,299 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 138 |
+
2025-07-14 01:14:58,452 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 139 |
+
2025-07-14 01:14:58,584 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 140 |
+
2025-07-14 01:14:58,780 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 141 |
+
2025-07-14 01:14:59,089 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 142 |
+
2025-07-14 01:14:59,926 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 143 |
+
2025-07-14 01:15:00,032 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 144 |
+
2025-07-14 01:15:00,208 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
| 145 |
+
2025-07-14 01:15:00,533 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
|
src/logs/youtube_chatbot_20250714.log
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-07-14 00:58:58,526 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
|
| 2 |
+
2025-07-14 00:59:00,597 - src.utils.youtube_handler - INFO - Successfully got transcript in en
|
| 3 |
+
2025-07-14 00:59:00,784 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
|
| 4 |
+
2025-07-14 00:59:02,824 - src.utils.youtube_handler - INFO - Successfully got transcript in en
|
| 5 |
+
2025-07-14 00:59:03,041 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
|
| 6 |
+
2025-07-14 00:59:05,007 - src.utils.youtube_handler - INFO - Successfully got transcript in en
|
| 7 |
+
2025-07-14 00:59:05,209 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
|
| 8 |
+
2025-07-14 00:59:07,507 - src.utils.youtube_handler - INFO - Successfully got transcript in en
|
| 9 |
+
2025-07-14 01:12:20,696 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
|
| 10 |
+
2025-07-14 01:12:20,697 - src.utils.youtube_handler - INFO - Rate limiting: sleeping for 2.69 seconds
|
| 11 |
+
2025-07-14 01:12:24,675 - src.utils.youtube_handler - INFO - Rate limiting: sleeping for 1.72 seconds
|
| 12 |
+
2025-07-14 01:12:27,701 - src.utils.youtube_handler - INFO - Successfully got transcript in en on attempt 1
|
src/src/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# YouTube Transcript Chatbot Package
|
| 2 |
+
__version__ = "1.0.0"
|
| 3 |
+
__author__ = "YouTube Transcript Chatbot Team"
|
src/src/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (226 Bytes). View file
|
|
|
src/src/utils/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Utilities package
|
src/src/utils/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (160 Bytes). View file
|
|
|
src/src/utils/__pycache__/export_utils.cpython-310.pyc
ADDED
|
Binary file (7.4 kB). View file
|
|
|
src/src/utils/__pycache__/logger.cpython-310.pyc
ADDED
|
Binary file (1.38 kB). View file
|
|
|
src/src/utils/__pycache__/session_manager.cpython-310.pyc
ADDED
|
Binary file (6.49 kB). View file
|
|
|
src/src/utils/__pycache__/text_processor.cpython-310.pyc
ADDED
|
Binary file (12.7 kB). View file
|
|
|
src/src/utils/__pycache__/youtube_handler.cpython-310.pyc
ADDED
|
Binary file (10.8 kB). View file
|
|
|
src/src/utils/cache_manager.py
ADDED
|
@@ -0,0 +1,374 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Cache management utilities for vector stores and processed data.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import pickle
|
| 7 |
+
import hashlib
|
| 8 |
+
import logging
|
| 9 |
+
from datetime import datetime, timedelta
|
| 10 |
+
from typing import Any, Dict, List, Optional
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
import shutil
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)

class CacheManager:
    """Manages on-disk caching of vector stores, transcripts, and metadata.

    Layout under ``cache_dir``:
        vectorstores/  -- FAISS indexes written as directories via ``save_local``
        transcripts/   -- pickled transcript payloads (``<key>.pkl``)
        metadata/      -- pickled bookkeeping records (``<key>.pkl``)

    When total cache size exceeds the configured limit, the oldest files
    are evicted until usage drops to 80% of the limit.
    """

    def __init__(self, cache_dir: str = "cache", max_size_mb: int = 500):
        """
        Initialize cache manager.

        Args:
            cache_dir (str): Cache directory path.
            max_size_mb (int): Maximum cache size in MB.
        """
        self.cache_dir = Path(cache_dir)
        self.max_size_bytes = max_size_mb * 1024 * 1024
        self.ensure_cache_directory()

    def ensure_cache_directory(self):
        """Create the cache directory and its per-type subdirectories."""
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # One subdirectory per cache type.
        (self.cache_dir / "vectorstores").mkdir(exist_ok=True)
        (self.cache_dir / "transcripts").mkdir(exist_ok=True)
        (self.cache_dir / "metadata").mkdir(exist_ok=True)

    def _get_cache_key(self, data: str) -> str:
        """Return a filesystem-safe cache key for *data*.

        MD5 is used purely as a filename hash here, not for security.

        Args:
            data (str): Data to generate a key for.

        Returns:
            str: Hex-digest cache key.
        """
        return hashlib.md5(data.encode()).hexdigest()

    def _get_cache_path(self, cache_type: str, key: str) -> Path:
        """Return the ``.pkl`` path for *key* under the *cache_type* subdir.

        Args:
            cache_type (str): One of "vectorstores", "transcripts", "metadata".
            key (str): Cache key.

        Returns:
            Path: Cache file path.
        """
        return self.cache_dir / cache_type / f"{key}.pkl"

    def save_vectorstore(self, video_id: str, vectorstore: Any) -> bool:
        """Persist a FAISS vector store (plus a metadata record) for a video.

        Args:
            video_id (str): Video ID.
            vectorstore (Any): Vector store exposing FAISS's ``save_local``.

        Returns:
            bool: True if successful, False otherwise.
        """
        try:
            cache_key = self._get_cache_key(video_id)
            cache_path = self._get_cache_path("vectorstores", cache_key)

            # FAISS writes a directory; strip the ".pkl" suffix to get its name.
            vectorstore.save_local(str(cache_path.with_suffix("")))

            # Bookkeeping record used by get_cached_videos().
            metadata = {
                'video_id': video_id,
                'created_at': datetime.now().isoformat(),
                'cache_key': cache_key
            }
            metadata_path = self._get_cache_path("metadata", cache_key)
            with open(metadata_path, 'wb') as f:
                pickle.dump(metadata, f)

            logger.info(f"Vector store cached for video {video_id}")
            self._cleanup_cache()
            return True

        except Exception as e:
            logger.error(f"Error caching vector store: {e}")
            return False

    def load_vectorstore(self, video_id: str, embeddings: Any) -> Optional[Any]:
        """Load a cached FAISS vector store for a video.

        Args:
            video_id (str): Video ID.
            embeddings (Any): Embeddings object required by ``FAISS.load_local``.

        Returns:
            Optional[Any]: Vector store object, or None if not cached / on error.
        """
        try:
            cache_key = self._get_cache_key(video_id)
            cache_path = self._get_cache_path("vectorstores", cache_key)

            if not cache_path.with_suffix("").exists():
                return None

            # Imported lazily so the module works without langchain installed
            # until a vector store is actually loaded.
            from langchain_community.vectorstores import FAISS
            # NOTE(review): recent langchain versions require
            # allow_dangerous_deserialization=True here -- confirm against the
            # pinned langchain_community version before upgrading.
            vectorstore = FAISS.load_local(str(cache_path.with_suffix("")), embeddings)

            logger.info(f"Vector store loaded from cache for video {video_id}")
            return vectorstore

        except Exception as e:
            logger.error(f"Error loading vector store from cache: {e}")
            return None

    def save_transcript(self, video_id: str, transcript_data: Dict[str, Any]) -> bool:
        """Pickle transcript data to the transcript cache.

        Args:
            video_id (str): Video ID.
            transcript_data (Dict[str, Any]): Transcript payload.

        Returns:
            bool: True if successful, False otherwise.
        """
        try:
            cache_key = self._get_cache_key(video_id)
            cache_path = self._get_cache_path("transcripts", cache_key)

            cache_data = {
                'video_id': video_id,
                'transcript_data': transcript_data,
                'created_at': datetime.now().isoformat(),
                'cache_key': cache_key
            }

            with open(cache_path, 'wb') as f:
                pickle.dump(cache_data, f)

            logger.info(f"Transcript cached for video {video_id}")
            self._cleanup_cache()
            return True

        except Exception as e:
            logger.error(f"Error caching transcript: {e}")
            return False

    def load_transcript(self, video_id: str) -> Optional[Dict[str, Any]]:
        """Load transcript data from the cache.

        Args:
            video_id (str): Video ID.

        Returns:
            Optional[Dict[str, Any]]: Transcript payload, or None if absent.
        """
        try:
            cache_key = self._get_cache_key(video_id)
            cache_path = self._get_cache_path("transcripts", cache_key)

            if not cache_path.exists():
                return None

            with open(cache_path, 'rb') as f:
                cache_data = pickle.load(f)

            logger.info(f"Transcript loaded from cache for video {video_id}")
            return cache_data['transcript_data']

        except Exception as e:
            logger.error(f"Error loading transcript from cache: {e}")
            return None

    def is_cached(self, video_id: str, cache_type: str = "vectorstores") -> bool:
        """Check whether data of *cache_type* is cached for *video_id*.

        Args:
            video_id (str): Video ID.
            cache_type (str): Type of cache to check.

        Returns:
            bool: True if cached, False otherwise (or on error).
        """
        try:
            cache_key = self._get_cache_key(video_id)

            if cache_type == "vectorstores":
                # Vector stores are directories (suffix stripped), not .pkl files.
                cache_path = self._get_cache_path("vectorstores", cache_key)
                return cache_path.with_suffix("").exists()
            else:
                cache_path = self._get_cache_path(cache_type, cache_key)
                return cache_path.exists()

        except Exception as e:
            logger.error(f"Error checking cache: {e}")
            return False

    def delete_cache(self, video_id: str) -> bool:
        """Delete all cached artifacts (vector store, transcript, metadata).

        Args:
            video_id (str): Video ID.

        Returns:
            bool: True if successful, False otherwise.
        """
        try:
            cache_key = self._get_cache_key(video_id)

            # Vector store cache is a directory.
            vectorstore_path = self._get_cache_path("vectorstores", cache_key)
            if vectorstore_path.with_suffix("").exists():
                shutil.rmtree(vectorstore_path.with_suffix(""))

            transcript_path = self._get_cache_path("transcripts", cache_key)
            if transcript_path.exists():
                transcript_path.unlink()

            metadata_path = self._get_cache_path("metadata", cache_key)
            if metadata_path.exists():
                metadata_path.unlink()

            logger.info(f"Cache deleted for video {video_id}")
            return True

        except Exception as e:
            logger.error(f"Error deleting cache: {e}")
            return False

    def get_cache_size(self) -> Dict[str, Any]:
        """Walk the cache directory and report size statistics.

        Returns:
            Dict[str, Any]: total_size_bytes, total_size_mb, file_count,
            max_size_mb, usage_percent; empty dict on error.
        """
        try:
            total_size = 0
            file_count = 0

            for root, dirs, files in os.walk(self.cache_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    if os.path.exists(file_path):
                        total_size += os.path.getsize(file_path)
                        file_count += 1

            return {
                'total_size_bytes': total_size,
                'total_size_mb': round(total_size / (1024 * 1024), 2),
                'file_count': file_count,
                'max_size_mb': self.max_size_bytes / (1024 * 1024),
                # Guard against max_size_mb=0 to avoid ZeroDivisionError.
                'usage_percent': round((total_size / self.max_size_bytes) * 100, 2) if self.max_size_bytes else 0.0
            }

        except Exception as e:
            logger.error(f"Error getting cache size: {e}")
            return {}

    def _cleanup_cache(self):
        """Evict oldest files (by mtime) until usage is <= 80% of the limit."""
        try:
            cache_info = self.get_cache_size()

            if cache_info.get('total_size_bytes', 0) > self.max_size_bytes:
                logger.info("Cache size exceeded, cleaning up...")

                # Collect all cache files with their modification times.
                cache_files = []
                for root, dirs, files in os.walk(self.cache_dir):
                    for file in files:
                        file_path = os.path.join(root, file)
                        if os.path.exists(file_path):
                            mtime = os.path.getmtime(file_path)
                            cache_files.append((file_path, mtime))

                # Oldest first.
                cache_files.sort(key=lambda x: x[1])

                current_size = cache_info.get('total_size_bytes', 0)
                target_size = self.max_size_bytes * 0.8  # Clean to 80% of max

                for file_path, _ in cache_files:
                    if current_size <= target_size:
                        break

                    try:
                        file_size = os.path.getsize(file_path)
                        os.remove(file_path)
                        current_size -= file_size
                        logger.debug(f"Deleted cache file: {file_path}")
                    except Exception as e:
                        logger.error(f"Error deleting cache file {file_path}: {e}")

                logger.info("Cache cleanup completed")

        except Exception as e:
            logger.error(f"Error during cache cleanup: {e}")

    def clear_all_cache(self) -> bool:
        """Remove the entire cache directory and recreate it empty.

        Returns:
            bool: True if successful, False otherwise.
        """
        try:
            if self.cache_dir.exists():
                shutil.rmtree(self.cache_dir)
            self.ensure_cache_directory()

            logger.info("All cache cleared")
            return True

        except Exception as e:
            logger.error(f"Error clearing cache: {e}")
            return False

    def get_cached_videos(self) -> List[Dict[str, Any]]:
        """List bookkeeping records of all cached videos.

        Note: records are written by save_vectorstore() only, so videos with
        only a cached transcript will not appear here.

        Returns:
            List[Dict[str, Any]]: video_id, cache_key, created_at, file_size
            per cached video; empty list on error.
        """
        try:
            cached_videos = []
            metadata_dir = self.cache_dir / "metadata"

            if not metadata_dir.exists():
                return cached_videos

            for metadata_file in metadata_dir.glob("*.pkl"):
                try:
                    with open(metadata_file, 'rb') as f:
                        metadata = pickle.load(f)

                    cached_videos.append({
                        'video_id': metadata.get('video_id'),
                        'cache_key': metadata.get('cache_key'),
                        'created_at': metadata.get('created_at'),
                        'file_size': metadata_file.stat().st_size
                    })

                except Exception as e:
                    logger.error(f"Error reading metadata file {metadata_file}: {e}")

            return cached_videos

        except Exception as e:
            logger.error(f"Error getting cached videos: {e}")
            return []
|
src/src/utils/database.py
ADDED
|
@@ -0,0 +1,373 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Database utilities for storing processed videos and conversations.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import sqlite3
|
| 6 |
+
import json
|
| 7 |
+
import logging
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
from typing import Dict, List, Any, Optional
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)

class DatabaseManager:
    """Manages SQLite database operations for the chatbot.

    Owns four tables: videos, conversations, vector_stores (cache blobs),
    and user_sessions. Connections are opened per call via
    ``sqlite3.connect`` context managers.

    NOTE(review): SQLite does not enforce FOREIGN KEY constraints unless
    ``PRAGMA foreign_keys = ON`` is executed per connection; deletes below
    therefore remove child rows manually.
    """

    def __init__(self, db_path: str = "data/chatbot.db"):
        """
        Initialize database manager.

        Args:
            db_path (str): Path to SQLite database file.
        """
        self.db_path = db_path
        self.ensure_db_directory()
        self.init_database()

    def ensure_db_directory(self):
        """Ensure the database's parent directory exists."""
        db_dir = Path(self.db_path).parent
        db_dir.mkdir(parents=True, exist_ok=True)

    def init_database(self):
        """Create all tables if they do not exist yet.

        Raises:
            Exception: Re-raises any error from sqlite3, after logging.
        """
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()

                # Videos table: one row per processed video.
                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS videos (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        video_id TEXT UNIQUE NOT NULL,
                        url TEXT NOT NULL,
                        title TEXT,
                        author TEXT,
                        duration INTEGER,
                        views INTEGER,
                        publish_date TEXT,
                        thumbnail_url TEXT,
                        transcript TEXT,
                        metadata TEXT,
                        processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        language TEXT DEFAULT 'en'
                    )
                ''')

                # Conversations table: Q/A pairs, optionally tied to a video.
                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS conversations (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        conversation_id TEXT NOT NULL,
                        video_id TEXT,
                        question TEXT NOT NULL,
                        answer TEXT NOT NULL,
                        source_documents TEXT,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        FOREIGN KEY (video_id) REFERENCES videos (video_id)
                    )
                ''')

                # Vector stores table (for caching serialized indexes).
                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS vector_stores (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        video_id TEXT UNIQUE NOT NULL,
                        vector_data BLOB,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        FOREIGN KEY (video_id) REFERENCES videos (video_id)
                    )
                ''')

                # User sessions table.
                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS user_sessions (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        session_id TEXT UNIQUE NOT NULL,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        last_activity TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        metadata TEXT
                    )
                ''')

                conn.commit()
                logger.info("Database initialized successfully")

        except Exception as e:
            logger.error(f"Error initializing database: {e}")
            raise

    def save_video(self, video_data: Dict[str, Any]) -> bool:
        """Insert or replace a video row.

        Args:
            video_data (Dict[str, Any]): Video data including metadata and
                transcript. Missing keys are stored as NULL ('language'
                defaults to 'en').

        Returns:
            bool: True if successful, False otherwise.
        """
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()

                # Parameterized query; INSERT OR REPLACE keeps video_id unique.
                cursor.execute('''
                    INSERT OR REPLACE INTO videos
                    (video_id, url, title, author, duration, views, publish_date,
                     thumbnail_url, transcript, metadata, language)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    video_data.get('video_id'),
                    video_data.get('url'),
                    video_data.get('title'),
                    video_data.get('author'),
                    video_data.get('duration'),
                    video_data.get('views'),
                    video_data.get('publish_date'),
                    video_data.get('thumbnail_url'),
                    video_data.get('transcript'),
                    json.dumps(video_data.get('metadata', {})),
                    video_data.get('language', 'en')
                ))

                conn.commit()
                return True

        except Exception as e:
            logger.error(f"Error saving video: {e}")
            return False

    def get_video(self, video_id: str) -> Optional[Dict[str, Any]]:
        """Fetch a video row by its video_id.

        Args:
            video_id (str): Video ID.

        Returns:
            Optional[Dict[str, Any]]: Video data (metadata JSON-decoded),
            or None if not found or on error.
        """
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()

                cursor.execute('''
                    SELECT video_id, url, title, author, duration, views,
                           publish_date, thumbnail_url, transcript, metadata,
                           processed_at, language
                    FROM videos WHERE video_id = ?
                ''', (video_id,))

                row = cursor.fetchone()
                if row:
                    return {
                        'video_id': row[0],
                        'url': row[1],
                        'title': row[2],
                        'author': row[3],
                        'duration': row[4],
                        'views': row[5],
                        'publish_date': row[6],
                        'thumbnail_url': row[7],
                        'transcript': row[8],
                        'metadata': json.loads(row[9]) if row[9] else {},
                        'processed_at': row[10],
                        'language': row[11]
                    }

        except Exception as e:
            logger.error(f"Error getting video: {e}")

        return None

    def save_conversation(self, conversation_data: Dict[str, Any]) -> bool:
        """Append a conversation (question/answer) entry.

        Args:
            conversation_data (Dict[str, Any]): Must carry conversation_id,
                question, answer; video_id and source_documents are optional.

        Returns:
            bool: True if successful, False otherwise.
        """
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()

                cursor.execute('''
                    INSERT INTO conversations
                    (conversation_id, video_id, question, answer, source_documents)
                    VALUES (?, ?, ?, ?, ?)
                ''', (
                    conversation_data.get('conversation_id'),
                    conversation_data.get('video_id'),
                    conversation_data.get('question'),
                    conversation_data.get('answer'),
                    json.dumps(conversation_data.get('source_documents', []))
                ))

                conn.commit()
                return True

        except Exception as e:
            logger.error(f"Error saving conversation: {e}")
            return False

    def get_conversations(self, video_id: Optional[str] = None,
                          conversation_id: Optional[str] = None,
                          limit: int = 100) -> List[Dict[str, Any]]:
        """Fetch conversations, newest first, with optional filters.

        Args:
            video_id (Optional[str]): Filter by video ID when given.
            conversation_id (Optional[str]): Filter by conversation ID when given.
            limit (int): Maximum number of results.

        Returns:
            List[Dict[str, Any]]: Conversations (source_documents JSON-decoded);
            empty list on error.
        """
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()

                query = '''
                    SELECT conversation_id, video_id, question, answer,
                           source_documents, created_at
                    FROM conversations
                '''
                params = []

                # Build WHERE clause dynamically from the provided filters;
                # values always go through placeholders (no SQL injection).
                conditions = []
                if video_id:
                    conditions.append('video_id = ?')
                    params.append(video_id)

                if conversation_id:
                    conditions.append('conversation_id = ?')
                    params.append(conversation_id)

                if conditions:
                    query += ' WHERE ' + ' AND '.join(conditions)

                query += ' ORDER BY created_at DESC LIMIT ?'
                params.append(limit)

                cursor.execute(query, params)
                rows = cursor.fetchall()

                conversations = []
                for row in rows:
                    conversations.append({
                        'conversation_id': row[0],
                        'video_id': row[1],
                        'question': row[2],
                        'answer': row[3],
                        'source_documents': json.loads(row[4]) if row[4] else [],
                        'created_at': row[5]
                    })

                return conversations

        except Exception as e:
            logger.error(f"Error getting conversations: {e}")
            return []

    def get_processed_videos(self, limit: int = 50) -> List[Dict[str, Any]]:
        """List processed videos, most recently processed first.

        Args:
            limit (int): Maximum number of results.

        Returns:
            List[Dict[str, Any]]: video_id, title, author, duration,
            processed_at per video; empty list on error.
        """
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()

                cursor.execute('''
                    SELECT video_id, title, author, duration, processed_at
                    FROM videos
                    ORDER BY processed_at DESC
                    LIMIT ?
                ''', (limit,))

                rows = cursor.fetchall()

                videos = []
                for row in rows:
                    videos.append({
                        'video_id': row[0],
                        'title': row[1],
                        'author': row[2],
                        'duration': row[3],
                        'processed_at': row[4]
                    })

                return videos

        except Exception as e:
            logger.error(f"Error getting processed videos: {e}")
            return []

    def delete_video(self, video_id: str) -> bool:
        """Delete a video and its dependent rows.

        Returns True even when no row matched (the end state is the same).

        Args:
            video_id (str): Video ID to delete.

        Returns:
            bool: True if successful, False otherwise.
        """
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()

                # Delete child rows first (manual foreign-key hygiene).
                cursor.execute('DELETE FROM conversations WHERE video_id = ?', (video_id,))
                cursor.execute('DELETE FROM vector_stores WHERE video_id = ?', (video_id,))
                cursor.execute('DELETE FROM videos WHERE video_id = ?', (video_id,))

                conn.commit()
                return True

        except Exception as e:
            logger.error(f"Error deleting video: {e}")
            return False

    def get_database_stats(self) -> Dict[str, Any]:
        """Report row counts and on-disk size of the database.

        Returns:
            Dict[str, Any]: total_videos, total_conversations,
            database_size_bytes, database_size_mb; empty dict on error.
        """
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()

                cursor.execute('SELECT COUNT(*) FROM videos')
                video_count = cursor.fetchone()[0]

                cursor.execute('SELECT COUNT(*) FROM conversations')
                conversation_count = cursor.fetchone()[0]

                # File size = page_count * page_size (SQLite pragma tables).
                cursor.execute("SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()")
                db_size = cursor.fetchone()[0]

                return {
                    'total_videos': video_count,
                    'total_conversations': conversation_count,
                    'database_size_bytes': db_size,
                    'database_size_mb': round(db_size / (1024 * 1024), 2)
                }

        except Exception as e:
            logger.error(f"Error getting database stats: {e}")
            return {}
|
src/src/utils/export_utils.py
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Export utilities for generating PDF, text, and other format exports.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import io
|
| 6 |
+
import json
|
| 7 |
+
import logging
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
from typing import List, Dict, Any, Optional
|
| 10 |
+
from reportlab.lib.pagesizes import letter, A4
|
| 11 |
+
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
|
| 12 |
+
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
| 13 |
+
from reportlab.lib.units import inch
|
| 14 |
+
from reportlab.lib.colors import HexColor
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
class ExportUtils:
    """Utilities for exporting chat history and transcripts in various formats."""

    def __init__(self):
        # Base reportlab stylesheet, extended with app-specific styles below.
        self.styles = getSampleStyleSheet()
        self.setup_custom_styles()

    def setup_custom_styles(self):
        """Setup custom styles for PDF generation."""
        self.styles.add(ParagraphStyle(
            name='CustomTitle',
            parent=self.styles['Heading1'],
            fontSize=16,
            spaceAfter=30,
            textColor=HexColor('#2E86AB')
        ))

        self.styles.add(ParagraphStyle(
            name='QuestionStyle',
            parent=self.styles['Normal'],
            fontSize=12,
            spaceAfter=10,
            textColor=HexColor('#A23B72'),
            leftIndent=20
        ))

        self.styles.add(ParagraphStyle(
            name='AnswerStyle',
            parent=self.styles['Normal'],
            fontSize=11,
            spaceAfter=20,
            leftIndent=40
        ))

    def _format_timestamp(self, raw) -> str:
        """
        Render an ISO-8601 timestamp as 'YYYY-MM-DD HH:MM:SS'.

        Falls back to the raw value when it cannot be parsed, so a single
        malformed/missing 'timestamp' entry no longer aborts an entire
        export (previously the exception made export_to_pdf/export_to_text
        return empty output).
        """
        try:
            return datetime.fromisoformat(raw).strftime('%Y-%m-%d %H:%M:%S')
        except (TypeError, ValueError):
            return str(raw)

    def export_to_pdf(self, chat_history: List[Dict[str, Any]],
                      video_metadata: Dict[str, Any] = None) -> bytes:
        """
        Export chat history to PDF format.

        Args:
            chat_history (List[Dict[str, Any]]): Chat history entries
            video_metadata (Dict[str, Any]): Video metadata

        Returns:
            bytes: PDF content as bytes (b"" on failure)
        """
        try:
            buffer = io.BytesIO()
            doc = SimpleDocTemplate(buffer, pagesize=A4)
            story = []

            # Title
            title = "YouTube Transcript Q&A Session"
            story.append(Paragraph(title, self.styles['CustomTitle']))
            story.append(Spacer(1, 12))

            # Video information
            if video_metadata:
                story.append(Paragraph("Video Information", self.styles['Heading2']))
                story.append(Paragraph(f"<b>Title:</b> {video_metadata.get('title', 'N/A')}",
                                       self.styles['Normal']))
                story.append(Paragraph(f"<b>Author:</b> {video_metadata.get('author', 'N/A')}",
                                       self.styles['Normal']))
                story.append(Paragraph(f"<b>Duration:</b> {self._format_duration(video_metadata.get('length', 0))}",
                                       self.styles['Normal']))
                story.append(Spacer(1, 20))

            # Export information
            story.append(Paragraph(f"<b>Exported on:</b> {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
                                   self.styles['Normal']))
            story.append(Paragraph(f"<b>Total Questions:</b> {len(chat_history)}",
                                   self.styles['Normal']))
            story.append(Spacer(1, 20))

            # Chat history
            story.append(Paragraph("Questions and Answers", self.styles['Heading2']))
            story.append(Spacer(1, 12))

            for i, entry in enumerate(chat_history, 1):
                # Question
                story.append(Paragraph(f"<b>Q{i}:</b> {entry['question']}",
                                       self.styles['QuestionStyle']))

                # Answer
                story.append(Paragraph(f"<b>A{i}:</b> {entry['answer']}",
                                       self.styles['AnswerStyle']))

                # Timestamp — guarded so one bad entry cannot kill the export.
                timestamp = self._format_timestamp(entry.get('timestamp', ''))
                story.append(Paragraph(f"<i>Asked on: {timestamp}</i>",
                                       self.styles['Normal']))
                story.append(Spacer(1, 15))

            doc.build(story)
            buffer.seek(0)
            return buffer.getvalue()

        except Exception as e:
            logger.error(f"Error generating PDF: {e}")
            return b""

    def export_to_text(self, chat_history: List[Dict[str, Any]],
                       video_metadata: Dict[str, Any] = None) -> str:
        """
        Export chat history to plain text format.

        Args:
            chat_history (List[Dict[str, Any]]): Chat history entries
            video_metadata (Dict[str, Any]): Video metadata

        Returns:
            str: Text content ("" on failure)
        """
        try:
            lines = []
            lines.append("YouTube Transcript Q&A Session")
            lines.append("=" * 50)
            lines.append("")

            # Video information
            if video_metadata:
                lines.append("Video Information:")
                lines.append(f"Title: {video_metadata.get('title', 'N/A')}")
                lines.append(f"Author: {video_metadata.get('author', 'N/A')}")
                lines.append(f"Duration: {self._format_duration(video_metadata.get('length', 0))}")
                lines.append("")

            # Export information
            lines.append(f"Exported on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
            lines.append(f"Total Questions: {len(chat_history)}")
            lines.append("")
            lines.append("Questions and Answers:")
            lines.append("-" * 30)
            lines.append("")

            # Chat history
            for i, entry in enumerate(chat_history, 1):
                # Guarded per-entry timestamp (see _format_timestamp).
                timestamp = self._format_timestamp(entry.get('timestamp', ''))
                lines.append(f"Q{i}: {entry['question']}")
                lines.append(f"A{i}: {entry['answer']}")
                lines.append(f"Asked on: {timestamp}")
                lines.append("")
                lines.append("-" * 30)
                lines.append("")

            return "\n".join(lines)

        except Exception as e:
            logger.error(f"Error generating text export: {e}")
            return ""

    def export_to_json(self, chat_history: List[Dict[str, Any]],
                       video_metadata: Dict[str, Any] = None) -> str:
        """
        Export chat history to JSON format.

        Args:
            chat_history (List[Dict[str, Any]]): Chat history entries
            video_metadata (Dict[str, Any]): Video metadata

        Returns:
            str: JSON content ("" on failure)
        """
        try:
            export_data = {
                'export_info': {
                    'exported_at': datetime.now().isoformat(),
                    'total_questions': len(chat_history),
                    'format_version': '1.0'
                },
                'video_metadata': video_metadata or {},
                'chat_history': chat_history
            }

            return json.dumps(export_data, indent=2, ensure_ascii=False)

        except Exception as e:
            logger.error(f"Error generating JSON export: {e}")
            return ""

    def export_transcript(self, transcript_text: str, video_metadata: Dict[str, Any] = None,
                          format: str = 'txt') -> str:
        """
        Export transcript in specified format.

        Args:
            transcript_text (str): Transcript text
            video_metadata (Dict[str, Any]): Video metadata
            format (str): Export format ('txt', 'json'); any other value
                returns the transcript unchanged.

        Returns:
            str: Exported transcript ("" on failure)
        """
        try:
            if format == 'txt':
                lines = []
                lines.append("YouTube Video Transcript")
                lines.append("=" * 30)
                lines.append("")

                if video_metadata:
                    lines.append(f"Title: {video_metadata.get('title', 'N/A')}")
                    lines.append(f"Author: {video_metadata.get('author', 'N/A')}")
                    lines.append(f"Duration: {self._format_duration(video_metadata.get('length', 0))}")
                    lines.append("")

                lines.append(f"Exported on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
                lines.append("")
                lines.append("Transcript:")
                lines.append("-" * 20)
                lines.append("")
                lines.append(transcript_text)

                return "\n".join(lines)

            elif format == 'json':
                export_data = {
                    'export_info': {
                        'exported_at': datetime.now().isoformat(),
                        'format_version': '1.0'
                    },
                    'video_metadata': video_metadata or {},
                    'transcript': transcript_text
                }
                return json.dumps(export_data, indent=2, ensure_ascii=False)

            return transcript_text

        except Exception as e:
            logger.error(f"Error exporting transcript: {e}")
            return ""

    def _format_duration(self, seconds: int) -> str:
        """Format duration from seconds to HH:MM:SS (or MM:SS) format; 'N/A' for 0/None."""
        if not seconds:
            return "N/A"

        hours = seconds // 3600
        minutes = (seconds % 3600) // 60
        seconds = seconds % 60

        if hours > 0:
            return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
        else:
            return f"{minutes:02d}:{seconds:02d}"
|
src/src/utils/logger.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Logging configuration and utilities.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
import os
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
from logging.handlers import RotatingFileHandler
|
| 9 |
+
|
| 10 |
+
def setup_logging(log_level: str = "INFO", log_file: str = None) -> logging.Logger:
    """
    Setup logging configuration.

    Args:
        log_level (str): Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        log_file (str): Optional log file path; defaults to a dated file
            under ./logs/

    Returns:
        logging.Logger: Configured logger for this module
    """
    # Create logs directory if it doesn't exist
    if log_file:
        # Only create the parent directory when the path has one:
        # os.makedirs('') raises FileNotFoundError for bare filenames.
        log_dir = os.path.dirname(log_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
    else:
        os.makedirs('logs', exist_ok=True)
        log_file = f'logs/youtube_chatbot_{datetime.now().strftime("%Y%m%d")}.log'

    # force=True replaces handlers installed by an earlier call, so
    # reconfiguration (e.g. on a Streamlit rerun) actually takes effect;
    # without it basicConfig is a silent no-op once logging is configured.
    logging.basicConfig(
        level=getattr(logging, log_level.upper()),
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(),  # Console output
            RotatingFileHandler(
                log_file,
                maxBytes=10*1024*1024,  # 10MB
                backupCount=5
            )
        ],
        force=True,
    )

    return logging.getLogger(__name__)
|
| 43 |
+
|
| 44 |
+
def get_logger(name: str) -> logging.Logger:
    """Return the logger registered under *name*.

    Thin wrapper around logging.getLogger, kept so application modules
    have a single import surface for logging.
    """
    return logging.getLogger(name)
|
src/src/utils/session_manager.py
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Session management utilities for handling chat history and application state.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import logging
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
from typing import Dict, List, Any, Optional
|
| 9 |
+
import streamlit as st
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
class SessionManager:
    """Manages session state, chat history, and conversation persistence."""

    def __init__(self):
        self.initialize_session_state()

    def initialize_session_state(self):
        """Ensure every session-state slot the app relies on exists."""
        # Slots with container defaults.
        factories = {
            'chat_history': list,
            'processed_videos': dict,
            'video_metadata': dict,
        }
        for key, factory in factories.items():
            if key not in st.session_state:
                st.session_state[key] = factory()

        # Slots that simply start out empty.
        for key in ('current_video', 'qa_chain', 'vectorstore'):
            if key not in st.session_state:
                st.session_state[key] = None

        # Handled separately: the default depends on the current time.
        if 'conversation_id' not in st.session_state:
            st.session_state['conversation_id'] = self.generate_conversation_id()

    def generate_conversation_id(self) -> str:
        """Generate a unique conversation ID."""
        return f"conv_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    def add_to_chat_history(self, question: str, answer: str, video_id: str = None,
                            source_docs: List[Any] = None):
        """
        Record one Q&A exchange in the session chat history.

        Args:
            question (str): User question
            answer (str): AI answer
            video_id (str): Associated video ID
            source_docs (List[Any]): Source documents used for the answer;
                only a 200-character preview of each is stored.
        """
        previews = []
        for doc in source_docs or []:
            content = doc.page_content
            previews.append(content if len(content) <= 200 else content[:200] + "...")

        st.session_state.chat_history.append({
            'timestamp': datetime.now().isoformat(),
            'question': question,
            'answer': answer,
            'video_id': video_id,
            'source_docs': previews,
            'conversation_id': st.session_state.conversation_id,
        })

    def get_chat_history(self, video_id: str = None) -> List[Dict[str, Any]]:
        """
        Return chat history entries, restricted to one video when video_id is given.

        Args:
            video_id (str): Optional video ID to filter by

        Returns:
            List[Dict[str, Any]]: Chat history entries
        """
        history = st.session_state.chat_history
        if not video_id:
            return history
        return [entry for entry in history if entry.get('video_id') == video_id]

    def clear_chat_history(self, video_id: str = None):
        """
        Drop chat history entries — all of them, or just one video's.

        Args:
            video_id (str): Optional video ID to clear history for
        """
        if video_id:
            st.session_state.chat_history = [
                entry for entry in st.session_state.chat_history
                if entry.get('video_id') != video_id
            ]
        else:
            st.session_state.chat_history = []

    def save_processed_video(self, video_url: str, video_id: str, metadata: Dict[str, Any],
                             transcript: str, qa_chain: Any, vectorstore: Any):
        """
        Cache a processed video in session state and make it the active one.

        Args:
            video_url (str): Video URL
            video_id (str): Video ID
            metadata (Dict[str, Any]): Video metadata
            transcript (str): Video transcript
            qa_chain (Any): QA chain object
            vectorstore (Any): Vector store object
        """
        st.session_state.processed_videos[video_id] = {
            'url': video_url,
            'metadata': metadata,
            'transcript': transcript,
            'processed_at': datetime.now().isoformat(),
            'conversation_id': st.session_state.conversation_id,
        }

        st.session_state.current_video = video_id
        st.session_state.qa_chain = qa_chain
        st.session_state.vectorstore = vectorstore
        st.session_state.video_metadata = metadata

    def get_processed_videos(self) -> Dict[str, Dict[str, Any]]:
        """Get all processed videos."""
        return st.session_state.processed_videos

    def switch_to_video(self, video_id: str) -> bool:
        """
        Make a previously processed video the current one.

        Args:
            video_id (str): Video ID to switch to

        Returns:
            bool: True if successful, False if video not found
        """
        if video_id not in st.session_state.processed_videos:
            return False
        st.session_state.current_video = video_id
        # NOTE: the QA chain and vector store are not restored here — a full
        # implementation would persist and reload those objects as well.
        return True

    def export_chat_history(self, format: str = 'json') -> str:
        """
        Serialize the chat history.

        Args:
            format (str): Export format ('json', 'txt')

        Returns:
            str: Exported chat history ("" for unknown formats)
        """
        history = st.session_state.chat_history

        if format == 'json':
            return json.dumps(history, indent=2)

        if format == 'txt':
            pieces = []
            for entry in history:
                pieces.append(f"Timestamp: {entry['timestamp']}")
                pieces.append(f"Question: {entry['question']}")
                pieces.append(f"Answer: {entry['answer']}")
                if entry.get('video_id'):
                    pieces.append(f"Video ID: {entry['video_id']}")
                pieces.append("-" * 50)
            return "\n".join(pieces)

        return ""

    def get_session_stats(self) -> Dict[str, Any]:
        """Get session statistics."""
        timestamps = (entry['timestamp'] for entry in st.session_state.chat_history)
        return {
            'total_questions': len(st.session_state.chat_history),
            'processed_videos': len(st.session_state.processed_videos),
            'current_video': st.session_state.current_video,
            'conversation_id': st.session_state.conversation_id,
            'session_start': min(timestamps, default=datetime.now().isoformat()),
        }
|
src/src/utils/text_processor.py
ADDED
|
@@ -0,0 +1,377 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Text processing utilities for document handling and vector store operations.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import logging
|
| 7 |
+
from typing import List, Optional, Dict, Any
|
| 8 |
+
try:
|
| 9 |
+
from langchain_openai import OpenAIEmbeddings, OpenAI
|
| 10 |
+
except ImportError:
|
| 11 |
+
from langchain_community.embeddings import OpenAIEmbeddings
|
| 12 |
+
from langchain_community.llms import OpenAI
|
| 13 |
+
from langchain_community.vectorstores import FAISS
|
| 14 |
+
from langchain_community.document_loaders import TextLoader
|
| 15 |
+
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
|
| 16 |
+
from langchain.chains import RetrievalQA
|
| 17 |
+
from langchain.docstore.document import Document
|
| 18 |
+
import pickle
|
| 19 |
+
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
class SimpleVectorStore:
    """
    Lightweight stand-in for an embedding-backed vector store.

    Keeps the raw documents and answers retrieval requests with plain
    keyword matching instead of embeddings.
    """

    def __init__(self, documents: 'List[Document]'):
        self.documents = documents
        # Plain-text view of every document, kept in the same order.
        self.texts = [doc.page_content for doc in documents]

    def as_retriever(self, search_type: str = "similarity", search_kwargs: dict = None):
        """Build a keyword-matching retriever over the stored documents."""
        return SimpleRetriever(self.documents, search_kwargs or {})
|
| 35 |
+
|
| 36 |
+
class SimpleRetriever:
    """Keyword-overlap retriever over an in-memory document list."""

    def __init__(self, documents: 'List[Document]', search_kwargs: dict):
        self.documents = documents
        # Number of documents to return; mirrors the vector-store `k` option.
        self.k = search_kwargs.get('k', 4)

    def get_relevant_documents(self, query: str) -> 'List[Document]':
        """Rank documents by how many query words they contain; return the top k."""
        words = query.lower().split()

        # (score, doc) pairs for every document that matches at least one word.
        scored = [
            (sum(word in doc.page_content.lower() for word in words), doc)
            for doc in self.documents
        ]
        ranked = sorted(
            (pair for pair in scored if pair[0] > 0),
            key=lambda pair: pair[0],
            reverse=True,
        )
        return [doc for _, doc in ranked[:self.k]]
|
| 57 |
+
|
| 58 |
+
class FallbackQAChain:
    """
    Fallback QA chain that works without OpenAI API.

    Provides basic keyword search over the transcript documents and
    returns a simple extractive answer.
    """

    def __init__(self, vectorstore):
        """
        Args:
            vectorstore: FAISS store or SimpleVectorStore to pull documents from.
        """
        self.vectorstore = vectorstore
        self.documents = []

        # Best-effort extraction of the raw documents; the chain still works
        # (returning "not found" answers) if the store layout is unknown.
        try:
            if isinstance(vectorstore, SimpleVectorStore):
                self.documents = vectorstore.documents
            elif hasattr(vectorstore, 'docstore') and hasattr(vectorstore.docstore, '_dict'):
                # FAISS keeps its documents in docstore._dict (private API).
                self.documents = list(vectorstore.docstore._dict.values())
        except Exception:
            # Was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt
            # are no longer swallowed.
            pass

    def __call__(self, inputs: Dict[str, str]) -> Dict[str, Any]:
        """
        Process a query and return an answer.

        Args:
            inputs (Dict[str, str]): Input dictionary with 'query' key

        Returns:
            Dict[str, Any]: Result dictionary with 'result' and 'source_documents'
        """
        query = inputs.get('query', '').lower()

        # Simple keyword-based search: score each document by the number of
        # query words it contains.
        relevant_docs = []
        for doc in self.documents:
            if hasattr(doc, 'page_content'):
                content = doc.page_content.lower()
                query_words = query.split()
                matches = sum(1 for word in query_words if word in content)
                if matches > 0:
                    relevant_docs.append((doc, matches))

        # Sort by relevance and take top results
        relevant_docs.sort(key=lambda x: x[1], reverse=True)
        top_docs = [doc for doc, _ in relevant_docs[:3]]

        # Generate simple answer from the best-matching excerpts.
        if top_docs:
            combined_text = " ".join([doc.page_content[:200] for doc in top_docs])
            answer = f"Based on the transcript, here's what I found: {combined_text[:500]}..."
        else:
            answer = "I couldn't find specific information about that in the transcript. Please try rephrasing your question or ask about different topics covered in the video."

        return {
            'result': answer,
            'source_documents': top_docs
        }
|
| 116 |
+
|
| 117 |
+
class TextProcessor:
|
| 118 |
+
"""Handles text processing, document splitting, and vector store operations."""
|
| 119 |
+
|
| 120 |
+
def __init__(self, openai_api_key: str):
    """
    Initialize TextProcessor with OpenAI API key.

    Args:
        openai_api_key (str): OpenAI API key used for both the embedding
            client and the completion LLM.
    """
    self.openai_api_key = openai_api_key
    # Embeddings back the FAISS vector store built in create_vector_store.
    self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    # temperature=0.7 trades some determinism for more natural answers.
    self.llm = OpenAI(openai_api_key=openai_api_key, temperature=0.7)
|
| 130 |
+
|
| 131 |
+
def create_documents_from_text(self, text: str, metadata: 'Dict[str, Any]' = None) -> 'List[Document]':
    """
    Split raw text into LangChain documents carrying shared metadata.

    Args:
        text (str): Input text
        metadata (Dict[str, Any]): Metadata attached to every chunk

    Returns:
        List[Document]: Chunked LangChain documents
    """
    # RecursiveCharacterTextSplitter prefers paragraph, then line, then word
    # boundaries before falling back to hard character cuts.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        separators=["\n\n", "\n", " ", ""],
    )

    source_doc = Document(page_content=text, metadata=metadata if metadata is not None else {})
    return splitter.split_documents([source_doc])
|
| 158 |
+
|
| 159 |
+
def create_vector_store(self, documents: 'List[Document]') -> 'Optional[FAISS]':
    """
    Create FAISS vector store from documents with fallback options.

    Tries OpenAI-embedding-backed FAISS first; on failure degrades to a
    keyword-search SimpleVectorStore.

    Args:
        documents (List[Document]): List of documents

    Returns:
        Optional[FAISS]: FAISS vector store (or fallback store) or None if failed
    """
    try:
        if not documents:
            logger.error("No documents provided for vector store creation")
            return None

        # Preferred path: real embeddings.
        try:
            return FAISS.from_documents(documents, self.embeddings)
        except Exception as openai_error:
            logger.warning(f"OpenAI embeddings failed: {openai_error}")

        # Degraded path: plain keyword matching, no API required.
        logger.info("Using simple text-based fallback")
        return self._create_simple_fallback_store(documents)

    except Exception as e:
        logger.error(f"Error creating vector store: {e}")
        return None
|
| 188 |
+
|
| 189 |
+
def _create_simple_fallback_store(self, documents: 'List[Document]') -> "Optional['SimpleVectorStore']":
    """
    Wrap documents in a keyword-search SimpleVectorStore.

    Args:
        documents (List[Document]): List of documents

    Returns:
        Optional[SimpleVectorStore]: Fallback store, or None if even that fails
    """
    try:
        store = SimpleVectorStore(documents)
        logger.info("Created simple text-based fallback vector store")
        return store
    except Exception as e:
        logger.error(f"Even fallback vector store creation failed: {e}")
        return None
|
| 207 |
+
|
| 208 |
+
def save_vector_store(self, vectorstore: FAISS, path: str) -> bool:
|
| 209 |
+
"""
|
| 210 |
+
Save vector store to disk.
|
| 211 |
+
|
| 212 |
+
Args:
|
| 213 |
+
vectorstore (FAISS): Vector store to save
|
| 214 |
+
path (str): Path to save the vector store
|
| 215 |
+
|
| 216 |
+
Returns:
|
| 217 |
+
bool: True if successful, False otherwise
|
| 218 |
+
"""
|
| 219 |
+
try:
|
| 220 |
+
os.makedirs(os.path.dirname(path) if os.path.dirname(path) else '.', exist_ok=True)
|
| 221 |
+
vectorstore.save_local(path)
|
| 222 |
+
return True
|
| 223 |
+
except Exception as e:
|
| 224 |
+
logger.error(f"Error saving vector store: {e}")
|
| 225 |
+
return False
|
| 226 |
+
|
| 227 |
+
def load_vector_store(self, path: str) -> Optional[FAISS]:
|
| 228 |
+
"""
|
| 229 |
+
Load vector store from disk.
|
| 230 |
+
|
| 231 |
+
Args:
|
| 232 |
+
path (str): Path to load the vector store from
|
| 233 |
+
|
| 234 |
+
Returns:
|
| 235 |
+
Optional[FAISS]: Loaded vector store or None if failed
|
| 236 |
+
"""
|
| 237 |
+
try:
|
| 238 |
+
if not os.path.exists(path):
|
| 239 |
+
logger.error(f"Vector store path does not exist: {path}")
|
| 240 |
+
return None
|
| 241 |
+
|
| 242 |
+
vectorstore = FAISS.load_local(path, self.embeddings)
|
| 243 |
+
return vectorstore
|
| 244 |
+
except Exception as e:
|
| 245 |
+
logger.error(f"Error loading vector store: {e}")
|
| 246 |
+
return None
|
| 247 |
+
|
| 248 |
+
def create_qa_chain(self, vectorstore, chain_type: str = "stuff") -> Optional[RetrievalQA]:
|
| 249 |
+
"""
|
| 250 |
+
Create QA chain from vector store with fallback options.
|
| 251 |
+
|
| 252 |
+
Args:
|
| 253 |
+
vectorstore: Vector store (FAISS or SimpleVectorStore)
|
| 254 |
+
chain_type (str): Type of chain to create
|
| 255 |
+
|
| 256 |
+
Returns:
|
| 257 |
+
Optional[RetrievalQA]: QA chain or None if failed
|
| 258 |
+
"""
|
| 259 |
+
try:
|
| 260 |
+
# Check if it's a simple vector store (fallback mode)
|
| 261 |
+
if isinstance(vectorstore, SimpleVectorStore):
|
| 262 |
+
logger.info("Using simple fallback QA system")
|
| 263 |
+
return FallbackQAChain(vectorstore)
|
| 264 |
+
|
| 265 |
+
retriever = vectorstore.as_retriever(
|
| 266 |
+
search_type="similarity",
|
| 267 |
+
search_kwargs={"k": 4}
|
| 268 |
+
)
|
| 269 |
+
|
| 270 |
+
# Try with OpenAI LLM first
|
| 271 |
+
try:
|
| 272 |
+
qa_chain = RetrievalQA.from_chain_type(
|
| 273 |
+
llm=self.llm,
|
| 274 |
+
chain_type=chain_type,
|
| 275 |
+
retriever=retriever,
|
| 276 |
+
return_source_documents=True
|
| 277 |
+
)
|
| 278 |
+
return qa_chain
|
| 279 |
+
except Exception as openai_error:
|
| 280 |
+
logger.warning(f"OpenAI LLM failed: {openai_error}")
|
| 281 |
+
|
| 282 |
+
# Fallback to a simple text-based QA system
|
| 283 |
+
logger.info("Creating fallback QA system")
|
| 284 |
+
return FallbackQAChain(vectorstore)
|
| 285 |
+
|
| 286 |
+
except Exception as e:
|
| 287 |
+
logger.error(f"Error creating QA chain: {e}")
|
| 288 |
+
return None
|
| 289 |
+
|
| 290 |
+
def _create_fallback_qa_chain(self, vectorstore: FAISS):
|
| 291 |
+
"""
|
| 292 |
+
Create a fallback QA chain that works without OpenAI API.
|
| 293 |
+
|
| 294 |
+
Args:
|
| 295 |
+
vectorstore (FAISS): Vector store
|
| 296 |
+
|
| 297 |
+
Returns:
|
| 298 |
+
FallbackQAChain: Simple QA chain
|
| 299 |
+
"""
|
| 300 |
+
return FallbackQAChain(vectorstore)
|
| 301 |
+
|
| 302 |
+
def process_transcript(self, transcript_text: str, metadata: Dict[str, Any] = None) -> Dict[str, Any]:
|
| 303 |
+
"""
|
| 304 |
+
Process transcript text and create QA chain.
|
| 305 |
+
|
| 306 |
+
Args:
|
| 307 |
+
transcript_text (str): Transcript text
|
| 308 |
+
metadata (Dict[str, Any]): Video metadata
|
| 309 |
+
|
| 310 |
+
Returns:
|
| 311 |
+
Dict[str, Any]: Processing result with QA chain and vector store
|
| 312 |
+
"""
|
| 313 |
+
result = {
|
| 314 |
+
'success': False,
|
| 315 |
+
'qa_chain': None,
|
| 316 |
+
'vectorstore': None,
|
| 317 |
+
'documents': None,
|
| 318 |
+
'error': None
|
| 319 |
+
}
|
| 320 |
+
|
| 321 |
+
try:
|
| 322 |
+
# Create documents from transcript
|
| 323 |
+
documents = self.create_documents_from_text(transcript_text, metadata)
|
| 324 |
+
if not documents:
|
| 325 |
+
result['error'] = "Failed to create documents from transcript"
|
| 326 |
+
return result
|
| 327 |
+
|
| 328 |
+
# Create vector store
|
| 329 |
+
vectorstore = self.create_vector_store(documents)
|
| 330 |
+
if not vectorstore:
|
| 331 |
+
result['error'] = "Failed to create vector store"
|
| 332 |
+
return result
|
| 333 |
+
|
| 334 |
+
# Create QA chain
|
| 335 |
+
qa_chain = self.create_qa_chain(vectorstore)
|
| 336 |
+
if not qa_chain:
|
| 337 |
+
result['error'] = "Failed to create QA chain"
|
| 338 |
+
return result
|
| 339 |
+
|
| 340 |
+
result['success'] = True
|
| 341 |
+
result['qa_chain'] = qa_chain
|
| 342 |
+
result['vectorstore'] = vectorstore
|
| 343 |
+
result['documents'] = documents
|
| 344 |
+
|
| 345 |
+
except Exception as e:
|
| 346 |
+
result['error'] = f"Error processing transcript: {str(e)}"
|
| 347 |
+
logger.error(f"Error processing transcript: {e}")
|
| 348 |
+
|
| 349 |
+
return result
|
| 350 |
+
|
| 351 |
+
def ask_question(self, qa_chain: RetrievalQA, question: str) -> Dict[str, Any]:
|
| 352 |
+
"""
|
| 353 |
+
Ask a question using the QA chain.
|
| 354 |
+
|
| 355 |
+
Args:
|
| 356 |
+
qa_chain (RetrievalQA): QA chain
|
| 357 |
+
question (str): Question to ask
|
| 358 |
+
|
| 359 |
+
Returns:
|
| 360 |
+
Dict[str, Any]: Answer and source documents
|
| 361 |
+
"""
|
| 362 |
+
try:
|
| 363 |
+
result = qa_chain({"query": question})
|
| 364 |
+
return {
|
| 365 |
+
'success': True,
|
| 366 |
+
'answer': result['result'],
|
| 367 |
+
'source_documents': result.get('source_documents', []),
|
| 368 |
+
'error': None
|
| 369 |
+
}
|
| 370 |
+
except Exception as e:
|
| 371 |
+
logger.error(f"Error asking question: {e}")
|
| 372 |
+
return {
|
| 373 |
+
'success': False,
|
| 374 |
+
'answer': None,
|
| 375 |
+
'source_documents': [],
|
| 376 |
+
'error': f"Error processing question: {str(e)}"
|
| 377 |
+
}
|
src/src/utils/youtube_handler.py
ADDED
|
@@ -0,0 +1,369 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
YouTube video handling utilities for transcript extraction and metadata retrieval.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import re
|
| 7 |
+
import logging
|
| 8 |
+
import time
|
| 9 |
+
import random
|
| 10 |
+
from typing import Optional, Dict, Any, List
|
| 11 |
+
from pytube import YouTube
|
| 12 |
+
from youtube_transcript_api import (
|
| 13 |
+
YouTubeTranscriptApi,
|
| 14 |
+
TranscriptsDisabled,
|
| 15 |
+
NoTranscriptFound,
|
| 16 |
+
VideoUnavailable,
|
| 17 |
+
CouldNotRetrieveTranscript
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
class YouTubeHandler:
    """Handles YouTube video operations including transcript extraction and metadata retrieval."""

    def __init__(self):
        """Set up language preferences and request-throttling state."""
        # Languages considered when hunting for a transcript
        self.supported_languages = [
            'en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh',
        ]
        # Throttling state: avoid hammering YouTube and getting IP-blocked
        self.last_request_time = 0
        self.min_request_interval = 3.0  # minimum seconds between requests
        self.max_retries = 3             # attempts per fetch strategy
        self.base_delay = 2.0            # base for exponential backoff
|
| 33 |
+
def _rate_limit(self):
|
| 34 |
+
"""Implement rate limiting to prevent IP blocking."""
|
| 35 |
+
current_time = time.time()
|
| 36 |
+
time_since_last_request = current_time - self.last_request_time
|
| 37 |
+
|
| 38 |
+
if time_since_last_request < self.min_request_interval:
|
| 39 |
+
sleep_time = self.min_request_interval - time_since_last_request
|
| 40 |
+
logger.info(f"Rate limiting: sleeping for {sleep_time:.2f} seconds")
|
| 41 |
+
time.sleep(sleep_time)
|
| 42 |
+
|
| 43 |
+
self.last_request_time = time.time()
|
| 44 |
+
|
| 45 |
+
def _exponential_backoff(self, attempt: int):
|
| 46 |
+
"""Implement exponential backoff for retries."""
|
| 47 |
+
delay = self.base_delay * (2 ** attempt) + random.uniform(0, 1)
|
| 48 |
+
logger.info(f"Exponential backoff: attempt {attempt + 1}, sleeping for {delay:.2f} seconds")
|
| 49 |
+
time.sleep(delay)
|
| 50 |
+
|
| 51 |
+
def validate_youtube_url(self, url: str) -> bool:
|
| 52 |
+
"""
|
| 53 |
+
Validate if the provided URL is a valid YouTube URL.
|
| 54 |
+
|
| 55 |
+
Args:
|
| 56 |
+
url (str): YouTube URL to validate
|
| 57 |
+
|
| 58 |
+
Returns:
|
| 59 |
+
bool: True if valid, False otherwise
|
| 60 |
+
"""
|
| 61 |
+
youtube_regex = re.compile(
|
| 62 |
+
r'(https?://)?(www\.)?(youtube|youtu|youtube-nocookie)\.(com|be)/'
|
| 63 |
+
r'(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})'
|
| 64 |
+
)
|
| 65 |
+
return bool(youtube_regex.match(url))
|
| 66 |
+
|
| 67 |
+
def extract_video_id(self, url: str) -> Optional[str]:
|
| 68 |
+
"""
|
| 69 |
+
Extract video ID from YouTube URL.
|
| 70 |
+
|
| 71 |
+
Args:
|
| 72 |
+
url (str): YouTube URL
|
| 73 |
+
|
| 74 |
+
Returns:
|
| 75 |
+
Optional[str]: Video ID if found, None otherwise
|
| 76 |
+
"""
|
| 77 |
+
try:
|
| 78 |
+
yt = YouTube(url)
|
| 79 |
+
return yt.video_id
|
| 80 |
+
except Exception as e:
|
| 81 |
+
logger.error(f"Error extracting video ID: {e}")
|
| 82 |
+
return None
|
| 83 |
+
|
| 84 |
+
def get_video_metadata(self, url: str) -> Dict[str, Any]:
|
| 85 |
+
"""
|
| 86 |
+
Get video metadata including title, description, duration, etc.
|
| 87 |
+
|
| 88 |
+
Args:
|
| 89 |
+
url (str): YouTube URL
|
| 90 |
+
|
| 91 |
+
Returns:
|
| 92 |
+
Dict[str, Any]: Video metadata
|
| 93 |
+
"""
|
| 94 |
+
try:
|
| 95 |
+
yt = YouTube(url)
|
| 96 |
+
metadata = {
|
| 97 |
+
'title': yt.title,
|
| 98 |
+
'description': yt.description,
|
| 99 |
+
'length': yt.length,
|
| 100 |
+
'views': yt.views,
|
| 101 |
+
'rating': getattr(yt, 'rating', None),
|
| 102 |
+
'author': yt.author,
|
| 103 |
+
'publish_date': yt.publish_date,
|
| 104 |
+
'thumbnail_url': yt.thumbnail_url,
|
| 105 |
+
'video_id': yt.video_id
|
| 106 |
+
}
|
| 107 |
+
return metadata
|
| 108 |
+
except Exception as e:
|
| 109 |
+
logger.error(f"Error getting video metadata: {e}")
|
| 110 |
+
return {}
|
| 111 |
+
|
| 112 |
+
def get_available_transcripts(self, video_id: str) -> List[Dict[str, str]]:
|
| 113 |
+
"""
|
| 114 |
+
Get list of available transcript languages for a video.
|
| 115 |
+
|
| 116 |
+
Args:
|
| 117 |
+
video_id (str): YouTube video ID
|
| 118 |
+
|
| 119 |
+
Returns:
|
| 120 |
+
List[Dict[str, str]]: List of available transcripts with language info
|
| 121 |
+
"""
|
| 122 |
+
try:
|
| 123 |
+
transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
|
| 124 |
+
available = []
|
| 125 |
+
|
| 126 |
+
for transcript in transcript_list:
|
| 127 |
+
available.append({
|
| 128 |
+
'language': transcript.language,
|
| 129 |
+
'language_code': transcript.language_code,
|
| 130 |
+
'is_generated': transcript.is_generated,
|
| 131 |
+
'is_translatable': transcript.is_translatable
|
| 132 |
+
})
|
| 133 |
+
|
| 134 |
+
return available
|
| 135 |
+
except Exception as e:
|
| 136 |
+
logger.error(f"Error getting available transcripts: {e}")
|
| 137 |
+
return []
|
| 138 |
+
|
| 139 |
+
    def get_youtube_transcript(self, url: str, language: str = 'en') -> Dict[str, Any]:
        """
        Extract transcript from YouTube video with comprehensive error handling and rate limiting.

        Tries three strategies in order, each with rate limiting and
        exponential-backoff retries:
          1. language-preference lookup via list_transcripts/find_transcript,
          2. any available transcript (manual preferred over auto-generated),
          3. bare get_transcript with no language hint.
        Network errors are classified into user-facing messages (IP block,
        region lock, private video, etc.).

        Args:
            url (str): YouTube video URL
            language (str): Preferred language code (default: 'en')

        Returns:
            Dict[str, Any]: Dictionary containing transcript text and metadata.
            On success also carries 'used_language' and 'transcript_data'
            (per-segment text/start/duration); on failure 'error' and
            sometimes 'suggestion'/'details'.
        """
        result = {
            'success': False,
            'transcript': '',
            'error': None,
            'metadata': {},
            'available_languages': []
        }

        try:
            if not self.validate_youtube_url(url):
                result['error'] = "Invalid YouTube URL format"
                return result

            video_id = self.extract_video_id(url)
            if not video_id:
                result['error'] = "Could not extract video ID from URL"
                return result

            # Apply rate limiting before making requests
            self._rate_limit()

            # Get video metadata
            result['metadata'] = self.get_video_metadata(url)

            # Apply rate limiting before transcript requests
            self._rate_limit()

            # Get available transcripts
            result['available_languages'] = self.get_available_transcripts(video_id)

            # Try to get transcript with multiple strategies and retries
            transcript_data = None
            used_language = None

            # Strategy 1: Try the standard approach with retries
            for attempt in range(self.max_retries):
                try:
                    if attempt > 0:
                        self._exponential_backoff(attempt - 1)

                    self._rate_limit()
                    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

                    # Try preferred language first, then fallback to English, then any available
                    languages_to_try = [language] if language != 'en' else []
                    languages_to_try.extend(['en'])
                    languages_to_try.extend([lang['language_code'] for lang in result['available_languages']
                                             if lang['language_code'] not in languages_to_try])

                    for lang in languages_to_try:
                        try:
                            transcript = transcript_list.find_transcript([lang])
                            transcript_data = transcript.fetch()
                            used_language = lang
                            logger.info(f"Successfully got transcript in {lang} on attempt {attempt + 1}")
                            break
                        except (NoTranscriptFound, TranscriptsDisabled):
                            # Expected when a language is missing — try the next one
                            continue

                    if transcript_data:
                        break

                except Exception as e:
                    logger.warning(f"Standard transcript method failed on attempt {attempt + 1}: {e}")
                    if attempt == self.max_retries - 1:
                        logger.error(f"All {self.max_retries} attempts failed for standard method")

            # Strategy 2: Try alternative approach if first failed
            if not transcript_data:
                for attempt in range(self.max_retries):
                    try:
                        if attempt > 0:
                            self._exponential_backoff(attempt - 1)

                        self._rate_limit()
                        # Try to get any available transcript without language preference
                        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
                        available_transcripts = list(transcript_list)

                        if available_transcripts:
                            # Try manual transcripts first (higher quality than auto-generated)
                            manual_transcripts = [t for t in available_transcripts if not t.is_generated]
                            if manual_transcripts:
                                transcript = manual_transcripts[0]
                            else:
                                transcript = available_transcripts[0]

                            transcript_data = transcript.fetch()
                            used_language = transcript.language_code
                            logger.info(f"Got transcript using alternative method in {used_language} on attempt {attempt + 1}")
                            break

                    except Exception as e:
                        logger.warning(f"Alternative transcript method failed on attempt {attempt + 1}: {e}")
                        if attempt == self.max_retries - 1:
                            logger.error(f"All {self.max_retries} attempts failed for alternative method")

            # Strategy 3: Try basic method as last resort
            if not transcript_data:
                for attempt in range(self.max_retries):
                    try:
                        if attempt > 0:
                            self._exponential_backoff(attempt - 1)

                        self._rate_limit()
                        # This is a last resort - try with minimal parameters
                        transcript_data = YouTubeTranscriptApi.get_transcript(video_id)
                        used_language = 'auto-detected'
                        logger.info(f"Got transcript using basic method on attempt {attempt + 1}")
                        break
                    except Exception as e:
                        logger.warning(f"Basic transcript method failed on attempt {attempt + 1}: {e}")
                        if attempt == self.max_retries - 1:
                            logger.error(f"All {self.max_retries} attempts failed for basic method")

            if transcript_data:
                # Format transcript text - handle both dict and object formats
                # (newer youtube-transcript-api versions return objects, older return dicts)
                text_parts = []
                formatted_transcript_data = []

                for item in transcript_data:
                    if hasattr(item, 'text'):
                        # New format: object with attributes
                        text_parts.append(item.text)
                        formatted_transcript_data.append({
                            'text': item.text,
                            'start': getattr(item, 'start', 0),
                            'duration': getattr(item, 'duration', 0)
                        })
                    elif isinstance(item, dict) and 'text' in item:
                        # Old format: dictionary
                        text_parts.append(item['text'])
                        formatted_transcript_data.append(item)
                    else:
                        # Fallback: convert to string
                        text_parts.append(str(item))
                        formatted_transcript_data.append({'text': str(item), 'start': 0, 'duration': 0})

                text = " ".join(text_parts)
                result['transcript'] = text
                result['success'] = True
                result['used_language'] = used_language
                result['transcript_data'] = formatted_transcript_data  # Raw transcript with timestamps
            else:
                result['error'] = "No transcript available in any supported language"

        except TranscriptsDisabled:
            result['error'] = "Transcripts are disabled for this video"
        except NoTranscriptFound:
            result['error'] = "No transcript found for this video"
        except VideoUnavailable:
            result['error'] = "This video is unavailable"
        except CouldNotRetrieveTranscript as e:
            # Classify by error text into a user-facing message + suggestion
            error_msg = str(e).lower()
            if "ip" in error_msg and "block" in error_msg:
                result['error'] = "IP blocked by YouTube: Too many requests from your IP address"
                result['suggestion'] = "Wait 10-15 minutes before trying again, or try a different network"
                result['details'] = "YouTube has temporarily blocked your IP due to too many requests. This is common when testing or using cloud services."
            elif "region" in error_msg or "country" in error_msg:
                result['error'] = "Regional restriction: This video's transcripts are not available in your region"
                result['suggestion'] = "Try using a VPN or try a different video"
            elif "private" in error_msg:
                result['error'] = "This video is private and transcripts cannot be accessed"
            elif "disabled" in error_msg:
                result['error'] = "Transcripts are disabled for this video"
            elif "cloud provider" in error_msg:
                result['error'] = "Cloud provider IP blocked: YouTube blocks most cloud service IPs"
                result['suggestion'] = "Try from a different network or wait before retrying"
                result['details'] = "YouTube automatically blocks IPs from cloud providers like AWS, Google Cloud, etc."
            else:
                result['error'] = f"Could not retrieve transcript: {str(e)}"
            logger.warning(f"Could not retrieve transcript for video: {e}")
        except Exception as e:
            # Same classification for errors surfaced outside the typed exceptions
            error_msg = str(e).lower()
            if "ip" in error_msg and ("block" in error_msg or "ban" in error_msg):
                result['error'] = "IP blocked by YouTube: Too many requests from your IP address"
                result['suggestion'] = "Wait 10-15 minutes before trying again, or try a different network"
                result['details'] = "YouTube has temporarily blocked your IP due to too many requests. This is common when testing or using cloud services."
            elif "cloud provider" in error_msg or "aws" in error_msg or "google cloud" in error_msg or "azure" in error_msg:
                result['error'] = "Cloud provider IP blocked: YouTube blocks most cloud service IPs"
                result['suggestion'] = "Try from a different network or wait before retrying"
                result['details'] = "YouTube automatically blocks IPs from cloud providers like AWS, Google Cloud, etc."
            elif "region" in error_msg or "country" in error_msg:
                result['error'] = "Regional restriction: This video's transcripts are not available in your region"
                result['suggestion'] = "Try using a VPN or try a different video"
            elif "private" in error_msg:
                result['error'] = "This video is private and transcripts cannot be accessed"
            elif "unavailable" in error_msg:
                result['error'] = "This video is unavailable or has been removed"
            elif "disabled" in error_msg:
                result['error'] = "Transcripts are disabled for this video"
            elif "too many requests" in error_msg:
                result['error'] = "Rate limited: Too many requests to YouTube"
                result['suggestion'] = "Wait a few minutes before trying again"
                result['details'] = "You've made too many requests to YouTube. Please wait before trying again."
            else:
                result['error'] = f"Unexpected error: {str(e)}"
            logger.error(f"Unexpected error getting transcript: {e}")

        return result
+
|
| 351 |
+
def save_transcript_to_file(self, transcript_text: str, filename: str = "transcript.txt") -> bool:
|
| 352 |
+
"""
|
| 353 |
+
Save transcript text to a file.
|
| 354 |
+
|
| 355 |
+
Args:
|
| 356 |
+
transcript_text (str): Transcript text to save
|
| 357 |
+
filename (str): Output filename
|
| 358 |
+
|
| 359 |
+
Returns:
|
| 360 |
+
bool: True if successful, False otherwise
|
| 361 |
+
"""
|
| 362 |
+
try:
|
| 363 |
+
os.makedirs(os.path.dirname(filename) if os.path.dirname(filename) else '.', exist_ok=True)
|
| 364 |
+
with open(filename, "w", encoding="utf-8") as f:
|
| 365 |
+
f.write(transcript_text)
|
| 366 |
+
return True
|
| 367 |
+
except Exception as e:
|
| 368 |
+
logger.error(f"Error saving transcript to file: {e}")
|
| 369 |
+
return False
|
src/static/style.css
ADDED
|
@@ -0,0 +1,501 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* YouTube Transcript Chatbot - Custom Styles - Dark Theme */

/* Global dark theme styling */
* {
    box-sizing: border-box;
}

body, html {
    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Helvetica Neue', Arial, sans-serif;
    line-height: 1.6;
    color: #e9ecef !important;
    background-color: #1a1a1a !important;
}

/* Ensure all text elements have proper contrast for dark theme */
p, span, div, label, h1, h2, h3, h4, h5, h6 {
    color: #e9ecef !important;
    text-rendering: optimizeLegibility;
    -webkit-font-smoothing: antialiased;
    -moz-osx-font-smoothing: grayscale;
}

/* Main container styling */
.main-container {
    max-width: 1200px;
    margin: 0 auto;
    padding: 20px;
    background-color: #1a1a1a !important;
}

/* Header styling: purple gradient banner */
.app-header {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 2rem;
    border-radius: 10px;
    margin-bottom: 2rem;
    text-align: center;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}

.app-header h1 {
    margin: 0;
    font-size: 2.5rem;
    font-weight: 700;
}

.app-header p {
    margin: 0.5rem 0 0 0;
    font-size: 1.1rem;
    opacity: 0.9;
}

/* Card styling - Dark Theme */
.info-card {
    background: #2d3748 !important;
    border-radius: 10px;
    padding: 1.5rem;
    margin: 1rem 0;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.3);
    border-left: 4px solid #667eea;
    color: #e9ecef !important;
}

/* Status variants tint background/border/text of .info-card */
.success-card {
    background: #1a2e1a !important;
    border-color: #28a745;
    color: #90ee90 !important;
}

.error-card {
    background: #2e1a1a !important;
    border-color: #dc3545;
    color: #ffb3b3 !important;
}

.warning-card {
    background: #2e2a1a !important;
    border-color: #ffc107;
    color: #ffe066 !important;
}

/* Video metadata styling - Dark Theme */
.video-metadata {
    background: #2d3748 !important;
    border-radius: 8px;
    padding: 1rem;
    margin: 1rem 0;
    border: 1px solid #4a5568;
}

.video-metadata h4 {
    color: #e9ecef !important;
    margin-bottom: 0.5rem;
}

/* Label/value rows separated by hairline dividers */
.metadata-item {
    display: flex;
    justify-content: space-between;
    padding: 0.25rem 0;
    border-bottom: 1px solid #4a5568;
}

.metadata-item:last-child {
    border-bottom: none;
}

.metadata-label {
    font-weight: 600;
    color: #a0aec0 !important;
}

.metadata-value {
    color: #e9ecef !important;
}

/* Chat history styling - Dark Theme */
.chat-container {
    max-height: 400px;
    overflow-y: auto;
    border: 1px solid #4a5568;
    border-radius: 8px;
    padding: 1rem;
    background: #2d3748 !important;
}

.chat-message {
    margin-bottom: 1rem;
    padding: 0.75rem;
    border-radius: 8px;
}

/* Question = blue accent, answer = purple accent + indent */
.chat-question {
    background: #1a365d !important;
    border-left: 4px solid #3182ce;
    color: #e9ecef !important;
}

.chat-answer {
    background: #322659 !important;
    border-left: 4px solid #9f7aea;
    margin-left: 1rem;
    color: #e9ecef !important;
}

.chat-timestamp {
    font-size: 0.8rem;
    color: #a0aec0 !important;
    margin-top: 0.5rem;
}

/* Button styling */
.custom-button {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    border: none;
    padding: 0.75rem 1.5rem;
    border-radius: 6px;
    font-weight: 600;
    cursor: pointer;
    transition: all 0.3s ease;
}

.custom-button:hover {
    transform: translateY(-2px);
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
}

/* Flat-color variants (override the gradient via background) */
.secondary-button {
    background: #6c757d;
}

.success-button {
    background: #28a745;
}

.danger-button {
    background: #dc3545;
}

/* Loading animation */
.loading-spinner {
    display: inline-block;
    width: 20px;
    height: 20px;
    border: 3px solid #f3f3f3;
    border-top: 3px solid #667eea;
    border-radius: 50%;
    animation: spin 1s linear infinite;
}

@keyframes spin {
    0% { transform: rotate(0deg); }
    100% { transform: rotate(360deg); }
}

/* Progress bar */
/* NOTE(review): track color #e9ecef is a light-theme value on this dark page — confirm intended */
.progress-bar {
    width: 100%;
    height: 6px;
    background: #e9ecef;
    border-radius: 3px;
    overflow: hidden;
    margin: 1rem 0;
}

.progress-fill {
    height: 100%;
    background: linear-gradient(90deg, #667eea, #764ba2);
    border-radius: 3px;
    transition: width 0.3s ease;
}

/* Sidebar styling - Dark Theme */
.sidebar-content {
    background: #2d3748 !important;
    padding: 1rem;
    border-radius: 8px;
    margin-bottom: 1rem;
    border: 1px solid #4a5568;
}

.sidebar-section {
    margin-bottom: 1.5rem;
}

.sidebar-section h4 {
    color: #e9ecef !important;
    margin-bottom: 0.5rem;
    font-size: 1.1rem;
}

/* Form styling */
.form-group {
    margin-bottom: 1rem;
}

/* NOTE(review): #495057 looks like a light-theme leftover — near-invisible
   on the dark background (the global label rule's !important wins anyway) */
.form-label {
    display: block;
    margin-bottom: 0.5rem;
    font-weight: 600;
    color: #495057;
}

.form-input {
    width: 100%;
    padding: 0.75rem;
    border: 1px solid #ced4da;
    border-radius: 6px;
    font-size: 1rem;
    transition: border-color 0.3s ease;
}

.form-input:focus {
    outline: none;
    border-color: #667eea;
    box-shadow: 0 0 0 2px rgba(102, 126, 234, 0.25);
}
|
| 259 |
+
|
| 260 |
+
/* Responsive design */
|
| 261 |
+
@media (max-width: 768px) {
|
| 262 |
+
.app-header h1 {
|
| 263 |
+
font-size: 2rem;
|
| 264 |
+
}
|
| 265 |
+
|
| 266 |
+
.main-container {
|
| 267 |
+
padding: 10px;
|
| 268 |
+
}
|
| 269 |
+
|
| 270 |
+
.info-card {
|
| 271 |
+
padding: 1rem;
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
.chat-answer {
|
| 275 |
+
margin-left: 0.5rem;
|
| 276 |
+
}
|
| 277 |
+
}
|
| 278 |
+
|
| 279 |
+
/* Streamlit specific overrides - Dark Theme */
|
| 280 |
+
|
| 281 |
+
/* Main app background */
|
| 282 |
+
.stApp {
|
| 283 |
+
background-color: #1a1a1a !important;
|
| 284 |
+
color: #e9ecef !important;
|
| 285 |
+
}
|
| 286 |
+
|
| 287 |
+
.stApp > div {
|
| 288 |
+
background-color: #1a1a1a !important;
|
| 289 |
+
}
|
| 290 |
+
|
| 291 |
+
/* Main content area */
|
| 292 |
+
.main .block-container {
|
| 293 |
+
background-color: #1a1a1a !important;
|
| 294 |
+
color: #e9ecef !important;
|
| 295 |
+
}
|
| 296 |
+
|
| 297 |
+
/* Sidebar styling */
|
| 298 |
+
.css-1d391kg, .css-1lcbmhc, .css-1aumxhk {
|
| 299 |
+
background-color: #2d3748 !important;
|
| 300 |
+
color: #e9ecef !important;
|
| 301 |
+
}
|
| 302 |
+
|
| 303 |
+
/* Button styling */
|
| 304 |
+
.stButton > button {
|
| 305 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
|
| 306 |
+
color: white !important;
|
| 307 |
+
border: none !important;
|
| 308 |
+
border-radius: 6px !important;
|
| 309 |
+
font-weight: 600 !important;
|
| 310 |
+
transition: all 0.3s ease !important;
|
| 311 |
+
}
|
| 312 |
+
|
| 313 |
+
.stButton > button:hover {
|
| 314 |
+
transform: translateY(-2px) !important;
|
| 315 |
+
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.4) !important;
|
| 316 |
+
color: white !important;
|
| 317 |
+
}
|
| 318 |
+
|
| 319 |
+
/* Input fields */
|
| 320 |
+
.stSelectbox > div > div {
|
| 321 |
+
border-radius: 6px !important;
|
| 322 |
+
background-color: #2d3748 !important;
|
| 323 |
+
color: #e9ecef !important;
|
| 324 |
+
border: 1px solid #4a5568 !important;
|
| 325 |
+
}
|
| 326 |
+
|
| 327 |
+
.stSelectbox label {
|
| 328 |
+
color: #e9ecef !important;
|
| 329 |
+
}
|
| 330 |
+
|
| 331 |
+
.stTextInput > div > div > input {
|
| 332 |
+
border-radius: 6px !important;
|
| 333 |
+
background-color: #2d3748 !important;
|
| 334 |
+
color: #e9ecef !important;
|
| 335 |
+
border: 1px solid #4a5568 !important;
|
| 336 |
+
}
|
| 337 |
+
|
| 338 |
+
.stTextInput label {
|
| 339 |
+
color: #e9ecef !important;
|
| 340 |
+
}
|
| 341 |
+
|
| 342 |
+
.stTextArea > div > div > textarea {
|
| 343 |
+
border-radius: 6px !important;
|
| 344 |
+
background-color: #2d3748 !important;
|
| 345 |
+
color: #e9ecef !important;
|
| 346 |
+
border: 1px solid #4a5568 !important;
|
| 347 |
+
}
|
| 348 |
+
|
| 349 |
+
.stTextArea label {
|
| 350 |
+
color: #e9ecef !important;
|
| 351 |
+
}
|
| 352 |
+
|
| 353 |
+
/* Success/Error message styling - Dark Theme */
|
| 354 |
+
.stSuccess {
|
| 355 |
+
background: #1a2e1a !important;
|
| 356 |
+
border: 1px solid #28a745 !important;
|
| 357 |
+
border-radius: 6px !important;
|
| 358 |
+
color: #90ee90 !important;
|
| 359 |
+
}
|
| 360 |
+
|
| 361 |
+
.stSuccess p {
|
| 362 |
+
color: #90ee90 !important;
|
| 363 |
+
}
|
| 364 |
+
|
| 365 |
+
.stError {
|
| 366 |
+
background: #2e1a1a !important;
|
| 367 |
+
border: 1px solid #dc3545 !important;
|
| 368 |
+
border-radius: 6px !important;
|
| 369 |
+
color: #ffb3b3 !important;
|
| 370 |
+
}
|
| 371 |
+
|
| 372 |
+
.stError p {
|
| 373 |
+
color: #ffb3b3 !important;
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
.stWarning {
|
| 377 |
+
background: #2e2a1a !important;
|
| 378 |
+
border: 1px solid #ffc107 !important;
|
| 379 |
+
border-radius: 6px !important;
|
| 380 |
+
color: #ffe066 !important;
|
| 381 |
+
}
|
| 382 |
+
|
| 383 |
+
.stWarning p {
|
| 384 |
+
color: #ffe066 !important;
|
| 385 |
+
}
|
| 386 |
+
|
| 387 |
+
.stInfo {
|
| 388 |
+
background: #1a2a2e !important;
|
| 389 |
+
border: 1px solid #17a2b8 !important;
|
| 390 |
+
border-radius: 6px !important;
|
| 391 |
+
color: #66d9ef !important;
|
| 392 |
+
}
|
| 393 |
+
|
| 394 |
+
.stInfo p {
|
| 395 |
+
color: #66d9ef !important;
|
| 396 |
+
}
|
| 397 |
+
|
| 398 |
+
/* Additional dark theme overrides */
|
| 399 |
+
.stMarkdown {
|
| 400 |
+
color: #e9ecef !important;
|
| 401 |
+
}
|
| 402 |
+
|
| 403 |
+
.stMarkdown p {
|
| 404 |
+
color: #e9ecef !important;
|
| 405 |
+
}
|
| 406 |
+
|
| 407 |
+
.stMarkdown h1, .stMarkdown h2, .stMarkdown h3, .stMarkdown h4, .stMarkdown h5, .stMarkdown h6 {
|
| 408 |
+
color: #e9ecef !important;
|
| 409 |
+
}
|
| 410 |
+
|
| 411 |
+
/* Expander styling */
|
| 412 |
+
.streamlit-expanderHeader {
|
| 413 |
+
background-color: #2d3748 !important;
|
| 414 |
+
color: #e9ecef !important;
|
| 415 |
+
border: 1px solid #4a5568 !important;
|
| 416 |
+
}
|
| 417 |
+
|
| 418 |
+
.streamlit-expanderContent {
|
| 419 |
+
background-color: #2d3748 !important;
|
| 420 |
+
color: #e9ecef !important;
|
| 421 |
+
border: 1px solid #4a5568 !important;
|
| 422 |
+
}
|
| 423 |
+
|
| 424 |
+
/* Metric styling */
|
| 425 |
+
.metric-container {
|
| 426 |
+
background-color: #2d3748 !important;
|
| 427 |
+
color: #e9ecef !important;
|
| 428 |
+
}
|
| 429 |
+
|
| 430 |
+
/* Code block styling */
|
| 431 |
+
.stCode {
|
| 432 |
+
background-color: #2d3748 !important;
|
| 433 |
+
color: #e9ecef !important;
|
| 434 |
+
border: 1px solid #4a5568 !important;
|
| 435 |
+
}
|
| 436 |
+
|
| 437 |
+
/* DataFrame styling */
|
| 438 |
+
.stDataFrame {
|
| 439 |
+
background-color: #2d3748 !important;
|
| 440 |
+
color: #e9ecef !important;
|
| 441 |
+
}
|
| 442 |
+
|
| 443 |
+
/* JSON display styling */
|
| 444 |
+
.stJson {
|
| 445 |
+
background-color: #2d3748 !important;
|
| 446 |
+
color: #e9ecef !important;
|
| 447 |
+
}
|
| 448 |
+
|
| 449 |
+
/* Spinner styling */
|
| 450 |
+
.stSpinner {
|
| 451 |
+
color: #667eea !important;
|
| 452 |
+
}
|
| 453 |
+
|
| 454 |
+
/* Progress bar styling */
|
| 455 |
+
.stProgress .st-bo {
|
| 456 |
+
background-color: #667eea !important;
|
| 457 |
+
}
|
| 458 |
+
|
| 459 |
+
/* Custom classes for dark theme */
|
| 460 |
+
.visible-text {
|
| 461 |
+
color: #e9ecef !important;
|
| 462 |
+
background-color: #2d3748 !important;
|
| 463 |
+
padding: 0.5rem !important;
|
| 464 |
+
border-radius: 4px !important;
|
| 465 |
+
border: 1px solid #4a5568 !important;
|
| 466 |
+
}
|
| 467 |
+
|
| 468 |
+
.high-contrast-text {
|
| 469 |
+
color: #ffffff !important;
|
| 470 |
+
background-color: #000000 !important;
|
| 471 |
+
font-weight: 600 !important;
|
| 472 |
+
padding: 0.5rem !important;
|
| 473 |
+
border-radius: 4px !important;
|
| 474 |
+
}
|
| 475 |
+
|
| 476 |
+
/* Override any remaining white backgrounds */
|
| 477 |
+
div[data-testid="stSidebar"] {
|
| 478 |
+
background-color: #2d3748 !important;
|
| 479 |
+
}
|
| 480 |
+
|
| 481 |
+
div[data-testid="stSidebar"] > div {
|
| 482 |
+
background-color: #2d3748 !important;
|
| 483 |
+
}
|
| 484 |
+
|
| 485 |
+
.css-1lcbmhc {
|
| 486 |
+
background-color: #2d3748 !important;
|
| 487 |
+
}
|
| 488 |
+
|
| 489 |
+
.css-1d391kg {
|
| 490 |
+
background-color: #1a1a1a !important;
|
| 491 |
+
}
|
| 492 |
+
|
| 493 |
+
/* Force dark theme on all containers */
|
| 494 |
+
.element-container {
|
| 495 |
+
background-color: transparent !important;
|
| 496 |
+
color: #e9ecef !important;
|
| 497 |
+
}
|
| 498 |
+
|
| 499 |
+
.stAlert {
|
| 500 |
+
color: #e9ecef !important;
|
| 501 |
+
}
|
src/tests/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Tests package
|
src/tests/test_session_manager.py
ADDED
|
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for session manager functionality.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import unittest
|
| 6 |
+
from unittest.mock import patch, MagicMock
|
| 7 |
+
import sys
|
| 8 |
+
import os
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
|
| 11 |
+
# Add src to path for imports
|
| 12 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
|
| 13 |
+
|
| 14 |
+
# Mock streamlit before importing session_manager
|
| 15 |
+
sys.modules['streamlit'] = MagicMock()
|
| 16 |
+
|
| 17 |
+
from src.utils.session_manager import SessionManager
|
| 18 |
+
|
| 19 |
+
class TestSessionManager(unittest.TestCase):
|
| 20 |
+
"""Test cases for SessionManager class."""
|
| 21 |
+
|
| 22 |
+
def setUp(self):
|
| 23 |
+
"""Set up test fixtures."""
|
| 24 |
+
# Mock streamlit session_state
|
| 25 |
+
self.mock_st = MagicMock()
|
| 26 |
+
self.mock_st.session_state = {}
|
| 27 |
+
|
| 28 |
+
with patch('src.utils.session_manager.st', self.mock_st):
|
| 29 |
+
self.session_manager = SessionManager()
|
| 30 |
+
|
| 31 |
+
def test_initialization(self):
|
| 32 |
+
"""Test SessionManager initialization."""
|
| 33 |
+
with patch('src.utils.session_manager.st', self.mock_st):
|
| 34 |
+
manager = SessionManager()
|
| 35 |
+
|
| 36 |
+
# Check that session state variables are initialized
|
| 37 |
+
expected_keys = [
|
| 38 |
+
'chat_history', 'processed_videos', 'current_video',
|
| 39 |
+
'qa_chain', 'vectorstore', 'video_metadata', 'conversation_id'
|
| 40 |
+
]
|
| 41 |
+
|
| 42 |
+
for key in expected_keys:
|
| 43 |
+
self.assertIn(key, self.mock_st.session_state)
|
| 44 |
+
|
| 45 |
+
def test_generate_conversation_id(self):
|
| 46 |
+
"""Test conversation ID generation."""
|
| 47 |
+
conv_id = self.session_manager.generate_conversation_id()
|
| 48 |
+
|
| 49 |
+
self.assertIsInstance(conv_id, str)
|
| 50 |
+
self.assertTrue(conv_id.startswith('conv_'))
|
| 51 |
+
self.assertEqual(len(conv_id), 19) # conv_ + YYYYMMDD_HHMMSS
|
| 52 |
+
|
| 53 |
+
def test_add_to_chat_history(self):
|
| 54 |
+
"""Test adding entries to chat history."""
|
| 55 |
+
with patch('src.utils.session_manager.st', self.mock_st):
|
| 56 |
+
self.mock_st.session_state = {
|
| 57 |
+
'chat_history': [],
|
| 58 |
+
'conversation_id': 'test_conv_123'
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
manager = SessionManager()
|
| 62 |
+
|
| 63 |
+
question = "What is this about?"
|
| 64 |
+
answer = "This is a test answer."
|
| 65 |
+
video_id = "test_video_123"
|
| 66 |
+
|
| 67 |
+
manager.add_to_chat_history(question, answer, video_id)
|
| 68 |
+
|
| 69 |
+
self.assertEqual(len(self.mock_st.session_state['chat_history']), 1)
|
| 70 |
+
|
| 71 |
+
entry = self.mock_st.session_state['chat_history'][0]
|
| 72 |
+
self.assertEqual(entry['question'], question)
|
| 73 |
+
self.assertEqual(entry['answer'], answer)
|
| 74 |
+
self.assertEqual(entry['video_id'], video_id)
|
| 75 |
+
self.assertEqual(entry['conversation_id'], 'test_conv_123')
|
| 76 |
+
self.assertIn('timestamp', entry)
|
| 77 |
+
|
| 78 |
+
def test_get_chat_history_all(self):
|
| 79 |
+
"""Test getting all chat history."""
|
| 80 |
+
with patch('src.utils.session_manager.st', self.mock_st):
|
| 81 |
+
test_history = [
|
| 82 |
+
{'question': 'Q1', 'answer': 'A1', 'video_id': 'vid1'},
|
| 83 |
+
{'question': 'Q2', 'answer': 'A2', 'video_id': 'vid2'}
|
| 84 |
+
]
|
| 85 |
+
self.mock_st.session_state = {'chat_history': test_history}
|
| 86 |
+
|
| 87 |
+
manager = SessionManager()
|
| 88 |
+
history = manager.get_chat_history()
|
| 89 |
+
|
| 90 |
+
self.assertEqual(history, test_history)
|
| 91 |
+
|
| 92 |
+
def test_get_chat_history_filtered(self):
|
| 93 |
+
"""Test getting filtered chat history by video ID."""
|
| 94 |
+
with patch('src.utils.session_manager.st', self.mock_st):
|
| 95 |
+
test_history = [
|
| 96 |
+
{'question': 'Q1', 'answer': 'A1', 'video_id': 'vid1'},
|
| 97 |
+
{'question': 'Q2', 'answer': 'A2', 'video_id': 'vid2'},
|
| 98 |
+
{'question': 'Q3', 'answer': 'A3', 'video_id': 'vid1'}
|
| 99 |
+
]
|
| 100 |
+
self.mock_st.session_state = {'chat_history': test_history}
|
| 101 |
+
|
| 102 |
+
manager = SessionManager()
|
| 103 |
+
history = manager.get_chat_history('vid1')
|
| 104 |
+
|
| 105 |
+
self.assertEqual(len(history), 2)
|
| 106 |
+
self.assertEqual(history[0]['video_id'], 'vid1')
|
| 107 |
+
self.assertEqual(history[1]['video_id'], 'vid1')
|
| 108 |
+
|
| 109 |
+
def test_clear_chat_history_all(self):
|
| 110 |
+
"""Test clearing all chat history."""
|
| 111 |
+
with patch('src.utils.session_manager.st', self.mock_st):
|
| 112 |
+
test_history = [
|
| 113 |
+
{'question': 'Q1', 'answer': 'A1', 'video_id': 'vid1'},
|
| 114 |
+
{'question': 'Q2', 'answer': 'A2', 'video_id': 'vid2'}
|
| 115 |
+
]
|
| 116 |
+
self.mock_st.session_state = {'chat_history': test_history}
|
| 117 |
+
|
| 118 |
+
manager = SessionManager()
|
| 119 |
+
manager.clear_chat_history()
|
| 120 |
+
|
| 121 |
+
self.assertEqual(self.mock_st.session_state['chat_history'], [])
|
| 122 |
+
|
| 123 |
+
def test_clear_chat_history_filtered(self):
|
| 124 |
+
"""Test clearing chat history for specific video."""
|
| 125 |
+
with patch('src.utils.session_manager.st', self.mock_st):
|
| 126 |
+
test_history = [
|
| 127 |
+
{'question': 'Q1', 'answer': 'A1', 'video_id': 'vid1'},
|
| 128 |
+
{'question': 'Q2', 'answer': 'A2', 'video_id': 'vid2'},
|
| 129 |
+
{'question': 'Q3', 'answer': 'A3', 'video_id': 'vid1'}
|
| 130 |
+
]
|
| 131 |
+
self.mock_st.session_state = {'chat_history': test_history}
|
| 132 |
+
|
| 133 |
+
manager = SessionManager()
|
| 134 |
+
manager.clear_chat_history('vid1')
|
| 135 |
+
|
| 136 |
+
remaining_history = self.mock_st.session_state['chat_history']
|
| 137 |
+
self.assertEqual(len(remaining_history), 1)
|
| 138 |
+
self.assertEqual(remaining_history[0]['video_id'], 'vid2')
|
| 139 |
+
|
| 140 |
+
def test_save_processed_video(self):
|
| 141 |
+
"""Test saving processed video information."""
|
| 142 |
+
with patch('src.utils.session_manager.st', self.mock_st):
|
| 143 |
+
self.mock_st.session_state = {
|
| 144 |
+
'processed_videos': {},
|
| 145 |
+
'conversation_id': 'test_conv_123'
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
manager = SessionManager()
|
| 149 |
+
|
| 150 |
+
video_url = "https://youtube.com/watch?v=test123"
|
| 151 |
+
video_id = "test123"
|
| 152 |
+
metadata = {"title": "Test Video", "author": "Test Author"}
|
| 153 |
+
transcript = "This is a test transcript."
|
| 154 |
+
qa_chain = MagicMock()
|
| 155 |
+
vectorstore = MagicMock()
|
| 156 |
+
|
| 157 |
+
manager.save_processed_video(
|
| 158 |
+
video_url, video_id, metadata, transcript, qa_chain, vectorstore
|
| 159 |
+
)
|
| 160 |
+
|
| 161 |
+
# Check processed_videos
|
| 162 |
+
self.assertIn(video_id, self.mock_st.session_state['processed_videos'])
|
| 163 |
+
saved_video = self.mock_st.session_state['processed_videos'][video_id]
|
| 164 |
+
|
| 165 |
+
self.assertEqual(saved_video['url'], video_url)
|
| 166 |
+
self.assertEqual(saved_video['metadata'], metadata)
|
| 167 |
+
self.assertEqual(saved_video['transcript'], transcript)
|
| 168 |
+
self.assertEqual(saved_video['conversation_id'], 'test_conv_123')
|
| 169 |
+
self.assertIn('processed_at', saved_video)
|
| 170 |
+
|
| 171 |
+
# Check current session state
|
| 172 |
+
self.assertEqual(self.mock_st.session_state['current_video'], video_id)
|
| 173 |
+
self.assertEqual(self.mock_st.session_state['qa_chain'], qa_chain)
|
| 174 |
+
self.assertEqual(self.mock_st.session_state['vectorstore'], vectorstore)
|
| 175 |
+
self.assertEqual(self.mock_st.session_state['video_metadata'], metadata)
|
| 176 |
+
|
| 177 |
+
def test_get_processed_videos(self):
|
| 178 |
+
"""Test getting processed videos."""
|
| 179 |
+
with patch('src.utils.session_manager.st', self.mock_st):
|
| 180 |
+
test_videos = {
|
| 181 |
+
'vid1': {'title': 'Video 1'},
|
| 182 |
+
'vid2': {'title': 'Video 2'}
|
| 183 |
+
}
|
| 184 |
+
self.mock_st.session_state = {'processed_videos': test_videos}
|
| 185 |
+
|
| 186 |
+
manager = SessionManager()
|
| 187 |
+
videos = manager.get_processed_videos()
|
| 188 |
+
|
| 189 |
+
self.assertEqual(videos, test_videos)
|
| 190 |
+
|
| 191 |
+
def test_switch_to_video_success(self):
|
| 192 |
+
"""Test successful video switching."""
|
| 193 |
+
with patch('src.utils.session_manager.st', self.mock_st):
|
| 194 |
+
test_videos = {
|
| 195 |
+
'vid1': {'title': 'Video 1'},
|
| 196 |
+
'vid2': {'title': 'Video 2'}
|
| 197 |
+
}
|
| 198 |
+
self.mock_st.session_state = {'processed_videos': test_videos}
|
| 199 |
+
|
| 200 |
+
manager = SessionManager()
|
| 201 |
+
result = manager.switch_to_video('vid1')
|
| 202 |
+
|
| 203 |
+
self.assertTrue(result)
|
| 204 |
+
self.assertEqual(self.mock_st.session_state['current_video'], 'vid1')
|
| 205 |
+
|
| 206 |
+
def test_switch_to_video_failure(self):
|
| 207 |
+
"""Test video switching failure."""
|
| 208 |
+
with patch('src.utils.session_manager.st', self.mock_st):
|
| 209 |
+
self.mock_st.session_state = {'processed_videos': {}}
|
| 210 |
+
|
| 211 |
+
manager = SessionManager()
|
| 212 |
+
result = manager.switch_to_video('nonexistent_vid')
|
| 213 |
+
|
| 214 |
+
self.assertFalse(result)
|
| 215 |
+
|
| 216 |
+
def test_export_chat_history_json(self):
|
| 217 |
+
"""Test exporting chat history as JSON."""
|
| 218 |
+
with patch('src.utils.session_manager.st', self.mock_st):
|
| 219 |
+
test_history = [
|
| 220 |
+
{'question': 'Q1', 'answer': 'A1', 'timestamp': '2024-01-01T12:00:00'}
|
| 221 |
+
]
|
| 222 |
+
self.mock_st.session_state = {'chat_history': test_history}
|
| 223 |
+
|
| 224 |
+
manager = SessionManager()
|
| 225 |
+
result = manager.export_chat_history('json')
|
| 226 |
+
|
| 227 |
+
self.assertIsInstance(result, str)
|
| 228 |
+
self.assertIn('Q1', result)
|
| 229 |
+
self.assertIn('A1', result)
|
| 230 |
+
|
| 231 |
+
def test_export_chat_history_txt(self):
|
| 232 |
+
"""Test exporting chat history as text."""
|
| 233 |
+
with patch('src.utils.session_manager.st', self.mock_st):
|
| 234 |
+
test_history = [
|
| 235 |
+
{'question': 'Q1', 'answer': 'A1', 'timestamp': '2024-01-01T12:00:00'}
|
| 236 |
+
]
|
| 237 |
+
self.mock_st.session_state = {'chat_history': test_history}
|
| 238 |
+
|
| 239 |
+
manager = SessionManager()
|
| 240 |
+
result = manager.export_chat_history('txt')
|
| 241 |
+
|
| 242 |
+
self.assertIsInstance(result, str)
|
| 243 |
+
self.assertIn('Question: Q1', result)
|
| 244 |
+
self.assertIn('Answer: A1', result)
|
| 245 |
+
|
| 246 |
+
def test_get_session_stats(self):
|
| 247 |
+
"""Test getting session statistics."""
|
| 248 |
+
with patch('src.utils.session_manager.st', self.mock_st):
|
| 249 |
+
test_history = [
|
| 250 |
+
{'question': 'Q1', 'timestamp': '2024-01-01T12:00:00'},
|
| 251 |
+
{'question': 'Q2', 'timestamp': '2024-01-01T13:00:00'}
|
| 252 |
+
]
|
| 253 |
+
test_videos = {'vid1': {}, 'vid2': {}}
|
| 254 |
+
|
| 255 |
+
self.mock_st.session_state = {
|
| 256 |
+
'chat_history': test_history,
|
| 257 |
+
'processed_videos': test_videos,
|
| 258 |
+
'current_video': 'vid1',
|
| 259 |
+
'conversation_id': 'test_conv_123'
|
| 260 |
+
}
|
| 261 |
+
|
| 262 |
+
manager = SessionManager()
|
| 263 |
+
stats = manager.get_session_stats()
|
| 264 |
+
|
| 265 |
+
self.assertEqual(stats['total_questions'], 2)
|
| 266 |
+
self.assertEqual(stats['processed_videos'], 2)
|
| 267 |
+
self.assertEqual(stats['current_video'], 'vid1')
|
| 268 |
+
self.assertEqual(stats['conversation_id'], 'test_conv_123')
|
| 269 |
+
self.assertIn('session_start', stats)
|
| 270 |
+
|
| 271 |
+
if __name__ == '__main__':
|
| 272 |
+
unittest.main()
|
src/tests/test_text_processor.py
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for text processor functionality.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import unittest
|
| 6 |
+
from unittest.mock import patch, MagicMock
|
| 7 |
+
import sys
|
| 8 |
+
import os
|
| 9 |
+
|
| 10 |
+
# Add src to path for imports
|
| 11 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
|
| 12 |
+
|
| 13 |
+
from src.utils.text_processor import TextProcessor
|
| 14 |
+
|
| 15 |
+
class TestTextProcessor(unittest.TestCase):
|
| 16 |
+
"""Test cases for TextProcessor class."""
|
| 17 |
+
|
| 18 |
+
def setUp(self):
|
| 19 |
+
"""Set up test fixtures."""
|
| 20 |
+
self.api_key = "test_api_key"
|
| 21 |
+
self.processor = TextProcessor(self.api_key)
|
| 22 |
+
|
| 23 |
+
def test_initialization(self):
|
| 24 |
+
"""Test TextProcessor initialization."""
|
| 25 |
+
self.assertEqual(self.processor.openai_api_key, self.api_key)
|
| 26 |
+
self.assertIsNotNone(self.processor.embeddings)
|
| 27 |
+
self.assertIsNotNone(self.processor.llm)
|
| 28 |
+
|
| 29 |
+
def test_create_documents_from_text(self):
|
| 30 |
+
"""Test document creation from text."""
|
| 31 |
+
text = "This is a test transcript. It has multiple sentences."
|
| 32 |
+
metadata = {"video_id": "test123", "title": "Test Video"}
|
| 33 |
+
|
| 34 |
+
documents = self.processor.create_documents_from_text(text, metadata)
|
| 35 |
+
|
| 36 |
+
self.assertIsInstance(documents, list)
|
| 37 |
+
self.assertGreater(len(documents), 0)
|
| 38 |
+
|
| 39 |
+
# Check first document
|
| 40 |
+
first_doc = documents[0]
|
| 41 |
+
self.assertIn("test transcript", first_doc.page_content.lower())
|
| 42 |
+
self.assertEqual(first_doc.metadata["video_id"], "test123")
|
| 43 |
+
self.assertEqual(first_doc.metadata["title"], "Test Video")
|
| 44 |
+
|
| 45 |
+
def test_create_documents_from_text_no_metadata(self):
|
| 46 |
+
"""Test document creation without metadata."""
|
| 47 |
+
text = "Simple test text."
|
| 48 |
+
|
| 49 |
+
documents = self.processor.create_documents_from_text(text)
|
| 50 |
+
|
| 51 |
+
self.assertIsInstance(documents, list)
|
| 52 |
+
self.assertGreater(len(documents), 0)
|
| 53 |
+
self.assertEqual(documents[0].metadata, {})
|
| 54 |
+
|
| 55 |
+
@patch('src.utils.text_processor.FAISS')
|
| 56 |
+
def test_create_vector_store_success(self, mock_faiss):
|
| 57 |
+
"""Test successful vector store creation."""
|
| 58 |
+
mock_vectorstore = MagicMock()
|
| 59 |
+
mock_faiss.from_documents.return_value = mock_vectorstore
|
| 60 |
+
|
| 61 |
+
documents = [MagicMock()]
|
| 62 |
+
result = self.processor.create_vector_store(documents)
|
| 63 |
+
|
| 64 |
+
self.assertEqual(result, mock_vectorstore)
|
| 65 |
+
mock_faiss.from_documents.assert_called_once_with(documents, self.processor.embeddings)
|
| 66 |
+
|
| 67 |
+
def test_create_vector_store_empty_documents(self):
|
| 68 |
+
"""Test vector store creation with empty documents."""
|
| 69 |
+
result = self.processor.create_vector_store([])
|
| 70 |
+
self.assertIsNone(result)
|
| 71 |
+
|
| 72 |
+
@patch('src.utils.text_processor.FAISS')
|
| 73 |
+
def test_create_vector_store_failure(self, mock_faiss):
|
| 74 |
+
"""Test vector store creation failure."""
|
| 75 |
+
mock_faiss.from_documents.side_effect = Exception("Test error")
|
| 76 |
+
|
| 77 |
+
documents = [MagicMock()]
|
| 78 |
+
result = self.processor.create_vector_store(documents)
|
| 79 |
+
|
| 80 |
+
self.assertIsNone(result)
|
| 81 |
+
|
| 82 |
+
@patch('src.utils.text_processor.RetrievalQA')
|
| 83 |
+
def test_create_qa_chain_success(self, mock_retrieval_qa):
|
| 84 |
+
"""Test successful QA chain creation."""
|
| 85 |
+
mock_qa_chain = MagicMock()
|
| 86 |
+
mock_retrieval_qa.from_chain_type.return_value = mock_qa_chain
|
| 87 |
+
|
| 88 |
+
mock_vectorstore = MagicMock()
|
| 89 |
+
mock_retriever = MagicMock()
|
| 90 |
+
mock_vectorstore.as_retriever.return_value = mock_retriever
|
| 91 |
+
|
| 92 |
+
result = self.processor.create_qa_chain(mock_vectorstore)
|
| 93 |
+
|
| 94 |
+
self.assertEqual(result, mock_qa_chain)
|
| 95 |
+
mock_vectorstore.as_retriever.assert_called_once()
|
| 96 |
+
mock_retrieval_qa.from_chain_type.assert_called_once()
|
| 97 |
+
|
| 98 |
+
@patch('src.utils.text_processor.RetrievalQA')
|
| 99 |
+
def test_create_qa_chain_failure(self, mock_retrieval_qa):
|
| 100 |
+
"""Test QA chain creation failure."""
|
| 101 |
+
mock_retrieval_qa.from_chain_type.side_effect = Exception("Test error")
|
| 102 |
+
|
| 103 |
+
mock_vectorstore = MagicMock()
|
| 104 |
+
result = self.processor.create_qa_chain(mock_vectorstore)
|
| 105 |
+
|
| 106 |
+
self.assertIsNone(result)
|
| 107 |
+
|
| 108 |
+
def test_ask_question_success(self):
|
| 109 |
+
"""Test successful question asking."""
|
| 110 |
+
mock_qa_chain = MagicMock()
|
| 111 |
+
mock_qa_chain.return_value = {
|
| 112 |
+
'result': 'Test answer',
|
| 113 |
+
'source_documents': [MagicMock()]
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
question = "What is this about?"
|
| 117 |
+
result = self.processor.ask_question(mock_qa_chain, question)
|
| 118 |
+
|
| 119 |
+
self.assertTrue(result['success'])
|
| 120 |
+
self.assertEqual(result['answer'], 'Test answer')
|
| 121 |
+
self.assertIsNotNone(result['source_documents'])
|
| 122 |
+
self.assertIsNone(result['error'])
|
| 123 |
+
|
| 124 |
+
mock_qa_chain.assert_called_once_with({"query": question})
|
| 125 |
+
|
| 126 |
+
def test_ask_question_failure(self):
|
| 127 |
+
"""Test question asking failure."""
|
| 128 |
+
mock_qa_chain = MagicMock()
|
| 129 |
+
mock_qa_chain.side_effect = Exception("Test error")
|
| 130 |
+
|
| 131 |
+
question = "What is this about?"
|
| 132 |
+
result = self.processor.ask_question(mock_qa_chain, question)
|
| 133 |
+
|
| 134 |
+
self.assertFalse(result['success'])
|
| 135 |
+
self.assertIsNone(result['answer'])
|
| 136 |
+
self.assertEqual(result['source_documents'], [])
|
| 137 |
+
self.assertIsNotNone(result['error'])
|
| 138 |
+
|
| 139 |
+
@patch.object(TextProcessor, 'create_qa_chain')
|
| 140 |
+
@patch.object(TextProcessor, 'create_vector_store')
|
| 141 |
+
@patch.object(TextProcessor, 'create_documents_from_text')
|
| 142 |
+
def test_process_transcript_success(self, mock_create_docs, mock_create_vs, mock_create_qa):
|
| 143 |
+
"""Test successful transcript processing."""
|
| 144 |
+
# Setup mocks
|
| 145 |
+
mock_documents = [MagicMock()]
|
| 146 |
+
mock_vectorstore = MagicMock()
|
| 147 |
+
mock_qa_chain = MagicMock()
|
| 148 |
+
|
| 149 |
+
mock_create_docs.return_value = mock_documents
|
| 150 |
+
mock_create_vs.return_value = mock_vectorstore
|
| 151 |
+
mock_create_qa.return_value = mock_qa_chain
|
| 152 |
+
|
| 153 |
+
transcript_text = "Test transcript text"
|
| 154 |
+
metadata = {"video_id": "test123"}
|
| 155 |
+
|
| 156 |
+
result = self.processor.process_transcript(transcript_text, metadata)
|
| 157 |
+
|
| 158 |
+
self.assertTrue(result['success'])
|
| 159 |
+
self.assertEqual(result['qa_chain'], mock_qa_chain)
|
| 160 |
+
self.assertEqual(result['vectorstore'], mock_vectorstore)
|
| 161 |
+
self.assertEqual(result['documents'], mock_documents)
|
| 162 |
+
self.assertIsNone(result['error'])
|
| 163 |
+
|
| 164 |
+
mock_create_docs.assert_called_once_with(transcript_text, metadata)
|
| 165 |
+
mock_create_vs.assert_called_once_with(mock_documents)
|
| 166 |
+
mock_create_qa.assert_called_once_with(mock_vectorstore)
|
| 167 |
+
|
| 168 |
+
@patch.object(TextProcessor, 'create_documents_from_text')
|
| 169 |
+
def test_process_transcript_document_creation_failure(self, mock_create_docs):
|
| 170 |
+
"""Test transcript processing with document creation failure."""
|
| 171 |
+
mock_create_docs.return_value = []
|
| 172 |
+
|
| 173 |
+
transcript_text = "Test transcript text"
|
| 174 |
+
result = self.processor.process_transcript(transcript_text)
|
| 175 |
+
|
| 176 |
+
self.assertFalse(result['success'])
|
| 177 |
+
self.assertIsNone(result['qa_chain'])
|
| 178 |
+
self.assertIsNone(result['vectorstore'])
|
| 179 |
+
self.assertIsNone(result['documents'])
|
| 180 |
+
self.assertEqual(result['error'], "Failed to create documents from transcript")
|
| 181 |
+
|
| 182 |
+
@patch.object(TextProcessor, 'create_vector_store')
|
| 183 |
+
@patch.object(TextProcessor, 'create_documents_from_text')
|
| 184 |
+
def test_process_transcript_vectorstore_creation_failure(self, mock_create_docs, mock_create_vs):
|
| 185 |
+
"""Test transcript processing with vector store creation failure."""
|
| 186 |
+
mock_create_docs.return_value = [MagicMock()]
|
| 187 |
+
mock_create_vs.return_value = None
|
| 188 |
+
|
| 189 |
+
transcript_text = "Test transcript text"
|
| 190 |
+
result = self.processor.process_transcript(transcript_text)
|
| 191 |
+
|
| 192 |
+
self.assertFalse(result['success'])
|
| 193 |
+
self.assertIsNone(result['qa_chain'])
|
| 194 |
+
self.assertIsNone(result['vectorstore'])
|
| 195 |
+
self.assertIsNotNone(result['documents'])
|
| 196 |
+
self.assertEqual(result['error'], "Failed to create vector store")
|
| 197 |
+
|
| 198 |
+
@patch.object(TextProcessor, 'create_qa_chain')
|
| 199 |
+
@patch.object(TextProcessor, 'create_vector_store')
|
| 200 |
+
@patch.object(TextProcessor, 'create_documents_from_text')
|
| 201 |
+
def test_process_transcript_qa_chain_creation_failure(self, mock_create_docs, mock_create_vs, mock_create_qa):
|
| 202 |
+
"""Test transcript processing with QA chain creation failure."""
|
| 203 |
+
mock_create_docs.return_value = [MagicMock()]
|
| 204 |
+
mock_create_vs.return_value = MagicMock()
|
| 205 |
+
mock_create_qa.return_value = None
|
| 206 |
+
|
| 207 |
+
transcript_text = "Test transcript text"
|
| 208 |
+
result = self.processor.process_transcript(transcript_text)
|
| 209 |
+
|
| 210 |
+
self.assertFalse(result['success'])
|
| 211 |
+
self.assertIsNone(result['qa_chain'])
|
| 212 |
+
self.assertIsNotNone(result['vectorstore'])
|
| 213 |
+
self.assertIsNotNone(result['documents'])
|
| 214 |
+
self.assertEqual(result['error'], "Failed to create QA chain")
|
| 215 |
+
|
| 216 |
+
# Allow running this test module directly (e.g. `python test_text_processor.py`)
# in addition to discovery via a test runner.
if __name__ == '__main__':
    unittest.main()
|
src/tests/test_youtube_handler.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for YouTube handler functionality.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import unittest
|
| 6 |
+
from unittest.mock import patch, MagicMock
|
| 7 |
+
import sys
|
| 8 |
+
import os
|
| 9 |
+
|
| 10 |
+
# Add src to path for imports
|
| 11 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
|
| 12 |
+
|
| 13 |
+
from src.utils.youtube_handler import YouTubeHandler
|
| 14 |
+
|
| 15 |
+
class TestYouTubeHandler(unittest.TestCase):
    """Unit tests for the YouTubeHandler utility class.

    Network access is never exercised: every test that would reach YouTube
    patches the ``YouTube`` object inside ``src.utils.youtube_handler``.
    """

    def setUp(self):
        """Create a fresh handler instance for every test."""
        self.handler = YouTubeHandler()

    def test_validate_youtube_url_valid(self):
        """All recognized YouTube URL shapes must validate as True."""
        valid_urls = [
            "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
            "https://youtu.be/dQw4w9WgXcQ",
            "http://youtube.com/watch?v=dQw4w9WgXcQ",
            "https://m.youtube.com/watch?v=dQw4w9WgXcQ",
        ]

        for url in valid_urls:
            with self.subTest(url=url):
                self.assertTrue(self.handler.validate_youtube_url(url))

    def test_validate_youtube_url_invalid(self):
        """Non-YouTube and malformed URLs must validate as False."""
        # NOTE(review): a ``None`` entry used to be in this list but was
        # explicitly skipped by an ``if url is not None`` guard, so it was
        # never exercised; the dead entry and guard are removed. Whether
        # validate_youtube_url tolerates None without raising remains
        # untested — confirm against the handler implementation.
        invalid_urls = [
            "https://www.google.com",
            "not_a_url",
            "https://vimeo.com/123456",
            "",
        ]

        for url in invalid_urls:
            with self.subTest(url=url):
                self.assertFalse(self.handler.validate_youtube_url(url))

    @patch('src.utils.youtube_handler.YouTube')
    def test_extract_video_id_success(self, mock_youtube):
        """extract_video_id returns the id reported by the YouTube object."""
        mock_yt = MagicMock()
        mock_yt.video_id = "dQw4w9WgXcQ"
        mock_youtube.return_value = mock_yt

        video_id = self.handler.extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
        self.assertEqual(video_id, "dQw4w9WgXcQ")

    @patch('src.utils.youtube_handler.YouTube')
    def test_extract_video_id_failure(self, mock_youtube):
        """extract_video_id returns None when the YouTube object raises."""
        mock_youtube.side_effect = Exception("Invalid URL")

        self.assertIsNone(self.handler.extract_video_id("invalid_url"))

    @patch('src.utils.youtube_handler.YouTube')
    def test_get_video_metadata_success(self, mock_youtube):
        """Metadata fields are copied through from the YouTube object."""
        mock_yt = MagicMock()
        mock_yt.title = "Test Video"
        mock_yt.author = "Test Author"
        mock_yt.length = 300
        mock_yt.views = 1000
        mock_yt.video_id = "dQw4w9WgXcQ"
        mock_youtube.return_value = mock_yt

        metadata = self.handler.get_video_metadata("https://www.youtube.com/watch?v=dQw4w9WgXcQ")

        expected = {
            'title': "Test Video",
            'author': "Test Author",
            'length': 300,
            'views': 1000,
            'video_id': "dQw4w9WgXcQ",
        }
        for key, value in expected.items():
            with self.subTest(field=key):
                self.assertEqual(metadata[key], value)

    @patch('src.utils.youtube_handler.YouTube')
    def test_get_video_metadata_failure(self, mock_youtube):
        """Metadata lookup errors collapse to an empty dict."""
        mock_youtube.side_effect = Exception("Network error")

        metadata = self.handler.get_video_metadata("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
        self.assertEqual(metadata, {})

    def test_save_transcript_to_file(self):
        """Transcript text is written to disk verbatim (UTF-8)."""
        import tempfile  # local import: only this test touches the filesystem

        test_text = "This is a test transcript."
        # Write into a throwaway directory so the test never pollutes the
        # working directory or collides with a pre-existing file; the
        # directory (and cleanup) is handled by the context manager.
        with tempfile.TemporaryDirectory() as tmp_dir:
            test_file = os.path.join(tmp_dir, "test_transcript.txt")

            result = self.handler.save_transcript_to_file(test_text, test_file)
            self.assertTrue(result)

            # Verify the file was created with exactly the given content.
            with open(test_file, 'r', encoding='utf-8') as f:
                self.assertEqual(f.read(), test_text)
|
| 113 |
+
|
| 114 |
+
# Allow running this test module directly (e.g. `python test_youtube_handler.py`)
# in addition to discovery via a test runner.
if __name__ == '__main__':
    unittest.main()
|