midlajvalappil commited on
Commit
f1ba2d2
·
verified ·
1 Parent(s): 7086bdb

Upload 29 files

Browse files
src/config/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Configuration package
src/config/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (157 Bytes). View file
 
src/config/__pycache__/settings.cpython-310.pyc ADDED
Binary file (5.8 kB). View file
 
src/config/config.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # YouTube Transcript Chatbot Configuration
2
+
3
+ app:
4
+ title: "AI-Powered YouTube Transcript Tutor"
5
+ description: "Ask questions from YouTube lecture transcripts using AI"
6
+ version: "1.0.0"
7
+
8
+ ui:
9
+ theme: "light" # light, dark, auto
10
+ sidebar_width: 300
11
+ max_chat_history_display: 50
12
+ enable_animations: true
13
+
14
+ processing:
15
+ default_chunk_size: 1000
16
+ chunk_overlap: 200
17
+ max_transcript_length: 1000000 # 1MB
18
+ supported_languages: ["en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "zh"]
19
+ default_language: "en"
20
+
21
+ ai:
22
+ model_temperature: 0.7
23
+ max_tokens: 2000
24
+ retrieval_k: 4 # Number of documents to retrieve
25
+ chain_type: "stuff" # stuff, map_reduce, refine, map_rerank
26
+
27
+ export:
28
+ formats: ["pdf", "txt", "json"]
29
+ max_export_entries: 1000
30
+ pdf_page_size: "A4"
31
+
32
+ cache:
33
+ enable_vectorstore_cache: true
34
+ cache_directory: "cache"
35
+ max_cache_size_mb: 500
36
+
37
+ logging:
38
+ level: "INFO" # DEBUG, INFO, WARNING, ERROR, CRITICAL
39
+ file: "logs/app.log"
40
+ max_file_size_mb: 10
41
+ backup_count: 5
42
+
43
+ security:
44
+ max_url_length: 2048
45
+ allowed_domains: ["youtube.com", "youtu.be", "m.youtube.com"]
46
+ rate_limit_requests: 100
47
+ rate_limit_window_minutes: 60
src/config/huggingface.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Hugging Face Spaces specific configuration
2
+
3
+ app:
4
+ title: "AI-Powered YouTube Transcript Tutor"
5
+ description: "Ask questions from YouTube lecture transcripts using AI"
6
+
7
+ logging:
8
+ level: "INFO"
9
+ file: null # Disable file logging
10
+
11
+ cache:
12
+ enable_vectorstore_cache: false
13
+ cache_directory: null
14
+
15
+ security:
16
+ youtube_api_fallback: true # Enable fallback methods for YouTube API
src/config/settings.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration settings management.
3
+ """
4
+
5
+ import os
6
+ import yaml
7
+ from typing import Dict, Any, Optional
8
+ from pathlib import Path
9
+
10
class Settings:
    """Application settings manager.

    Loads configuration from a YAML file (falling back to built-in
    defaults when the file is missing or unreadable) and then applies
    environment-variable overrides on top.
    """

    def __init__(self, config_file: str = "config/config.yaml"):
        """
        Initialize settings from config file and environment variables.

        Args:
            config_file (str): Path to configuration file
        """
        self.config_file = config_file
        self.config = self._load_config()
        self._override_with_env()

    def _load_config(self) -> Dict[str, Any]:
        """Load configuration from the YAML file.

        Returns:
            Dict[str, Any]: Parsed configuration, or the built-in default
            configuration when the file is absent or cannot be read.
        """
        try:
            config_path = Path(self.config_file)
            if config_path.exists():
                with open(config_path, 'r', encoding='utf-8') as f:
                    # An empty YAML file parses to None; normalize to {}.
                    return yaml.safe_load(f) or {}
            else:
                return self._get_default_config()
        except Exception as e:
            # Best-effort: report the problem and keep running on defaults
            # rather than crashing at import time.
            print(f"Error loading config file: {e}")
            return self._get_default_config()

    def _get_default_config(self) -> Dict[str, Any]:
        """Get default configuration (mirrors config/config.yaml)."""
        return {
            'app': {
                'title': 'AI-Powered YouTube Transcript Tutor',
                'description': 'Ask questions from YouTube lecture transcripts using AI',
                'version': '1.0.0'
            },
            'ui': {
                'theme': 'light',
                'sidebar_width': 300,
                'max_chat_history_display': 50,
                'enable_animations': True
            },
            'processing': {
                'default_chunk_size': 1000,
                'chunk_overlap': 200,
                'max_transcript_length': 1000000,
                'supported_languages': ['en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh'],
                'default_language': 'en'
            },
            'ai': {
                'model_temperature': 0.7,
                'max_tokens': 2000,
                'retrieval_k': 4,
                'chain_type': 'stuff'
            },
            'export': {
                'formats': ['pdf', 'txt', 'json'],
                'max_export_entries': 1000,
                'pdf_page_size': 'A4'
            },
            'cache': {
                'enable_vectorstore_cache': True,
                'cache_directory': 'cache',
                'max_cache_size_mb': 500
            },
            'logging': {
                'level': 'INFO',
                'file': 'logs/app.log',
                'max_file_size_mb': 10,
                'backup_count': 5
            },
            'security': {
                'max_url_length': 2048,
                'allowed_domains': ['youtube.com', 'youtu.be', 'm.youtube.com'],
                'rate_limit_requests': 100,
                'rate_limit_window_minutes': 60
            }
        }

    def _override_with_env(self):
        """Override configuration with environment variables.

        Recognized variables:
            OPENAI_API_KEY  -> ai.openai_api_key
            LOG_LEVEL       -> logging.level (upper-cased)
            CACHE_DIRECTORY -> cache.cache_directory

        Note: ``setdefault`` guards against a user-supplied YAML file that
        omits one of these sections; indexing directly would raise KeyError.
        """
        # OpenAI API Key
        openai_key = os.getenv('OPENAI_API_KEY')
        if openai_key:
            self.config.setdefault('ai', {})['openai_api_key'] = openai_key

        # Log level
        log_level = os.getenv('LOG_LEVEL')
        if log_level:
            self.config.setdefault('logging', {})['level'] = log_level.upper()

        # Cache directory
        cache_dir = os.getenv('CACHE_DIRECTORY')
        if cache_dir:
            self.config.setdefault('cache', {})['cache_directory'] = cache_dir

    def get(self, key: str, default: Any = None) -> Any:
        """
        Get configuration value using dot notation.

        Args:
            key (str): Configuration key (e.g., 'app.title')
            default (Any): Default value if key not found

        Returns:
            Any: Configuration value
        """
        keys = key.split('.')
        value = self.config

        try:
            for k in keys:
                value = value[k]
            return value
        except (KeyError, TypeError):
            # TypeError covers drilling into a non-dict intermediate value.
            return default

    def set(self, key: str, value: Any):
        """
        Set configuration value using dot notation, creating intermediate
        dictionaries as needed.

        Args:
            key (str): Configuration key (e.g., 'app.title')
            value (Any): Value to set
        """
        keys = key.split('.')
        config = self.config

        for k in keys[:-1]:
            if k not in config:
                config[k] = {}
            config = config[k]

        config[keys[-1]] = value

    def get_openai_api_key(self) -> Optional[str]:
        """Get OpenAI API key from config or environment."""
        return self.get('ai.openai_api_key') or os.getenv('OPENAI_API_KEY')

    def get_app_config(self) -> Dict[str, Any]:
        """Get application configuration section."""
        return self.get('app', {})

    def get_ui_config(self) -> Dict[str, Any]:
        """Get UI configuration section."""
        return self.get('ui', {})

    def get_processing_config(self) -> Dict[str, Any]:
        """Get processing configuration section."""
        return self.get('processing', {})

    def get_ai_config(self) -> Dict[str, Any]:
        """Get AI configuration section."""
        return self.get('ai', {})

    def get_export_config(self) -> Dict[str, Any]:
        """Get export configuration section."""
        return self.get('export', {})

    def get_cache_config(self) -> Dict[str, Any]:
        """Get cache configuration section."""
        return self.get('cache', {})

    def get_logging_config(self) -> Dict[str, Any]:
        """Get logging configuration section."""
        return self.get('logging', {})

    def get_security_config(self) -> Dict[str, Any]:
        """Get security configuration section."""
        return self.get('security', {})

# Global settings instance
settings = Settings()
src/logs/app.log ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-07-13 22:55:30,859 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
2
+ 2025-07-13 22:55:37,615 - src.utils.youtube_handler - ERROR - Unexpected error getting transcript: 'FetchedTranscriptSnippet' object is not subscriptable
3
+ 2025-07-13 22:56:36,886 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
4
+ 2025-07-13 22:56:40,977 - src.utils.youtube_handler - ERROR - Unexpected error getting transcript: 'FetchedTranscriptSnippet' object is not subscriptable
5
+ 2025-07-13 22:56:54,360 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
6
+ 2025-07-13 22:57:04,282 - src.utils.youtube_handler - ERROR - Unexpected error getting transcript: 'FetchedTranscriptSnippet' object is not subscriptable
7
+ 2025-07-13 22:58:12,592 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
8
+ 2025-07-13 22:58:21,552 - src.utils.youtube_handler - ERROR - Unexpected error getting transcript: 'FetchedTranscriptSnippet' object is not subscriptable
9
+ 2025-07-13 22:58:38,183 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
10
+ 2025-07-13 22:58:41,834 - src.utils.youtube_handler - ERROR - Unexpected error getting transcript: 'FetchedTranscriptSnippet' object is not subscriptable
11
+ 2025-07-13 22:59:16,207 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
12
+ 2025-07-13 22:59:22,975 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
13
+ 2025-07-13 22:59:23,716 - src.utils.youtube_handler - ERROR - Unexpected error getting transcript: 'FetchedTranscriptSnippet' object is not subscriptable
14
+ 2025-07-13 22:59:26,323 - src.utils.youtube_handler - ERROR - Unexpected error getting transcript: 'FetchedTranscriptSnippet' object is not subscriptable
15
+ 2025-07-13 23:01:13,950 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
16
+ 2025-07-13 23:01:35,772 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
17
+ 2025-07-13 23:01:35,774 - openai._base_client - INFO - Retrying request to /embeddings in 0.378161 seconds
18
+ 2025-07-13 23:01:37,503 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
19
+ 2025-07-13 23:01:37,505 - openai._base_client - INFO - Retrying request to /embeddings in 0.796060 seconds
20
+ 2025-07-13 23:01:39,284 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
21
+ 2025-07-13 23:01:39,286 - src.utils.text_processor - ERROR - Error creating vector store: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
22
+ 2025-07-13 23:02:22,588 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
23
+ 2025-07-13 23:02:36,283 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
24
+ 2025-07-13 23:02:36,285 - openai._base_client - INFO - Retrying request to /embeddings in 0.379324 seconds
25
+ 2025-07-13 23:02:37,475 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
26
+ 2025-07-13 23:02:37,476 - openai._base_client - INFO - Retrying request to /embeddings in 0.943958 seconds
27
+ 2025-07-13 23:02:39,327 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
28
+ 2025-07-13 23:02:39,328 - src.utils.text_processor - ERROR - Error creating vector store: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
29
+ 2025-07-13 23:09:22,969 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
30
+ 2025-07-13 23:09:26,985 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
31
+ 2025-07-13 23:09:26,986 - openai._base_client - INFO - Retrying request to /embeddings in 0.395765 seconds
32
+ 2025-07-13 23:09:27,911 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
33
+ 2025-07-13 23:09:27,913 - openai._base_client - INFO - Retrying request to /embeddings in 0.940555 seconds
34
+ 2025-07-13 23:09:29,552 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
35
+ 2025-07-13 23:09:29,554 - src.utils.text_processor - WARNING - OpenAI embeddings failed: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
36
+ 2025-07-13 23:09:29,554 - src.utils.text_processor - INFO - Using simple text-based fallback
37
+ 2025-07-13 23:09:29,554 - src.utils.text_processor - INFO - Created simple text-based fallback vector store
38
+ 2025-07-13 23:09:29,555 - src.utils.text_processor - INFO - Using simple fallback QA system
39
+ 2025-07-13 23:15:01,397 - src.utils.logger - INFO - Custom CSS loaded successfully
40
+ 2025-07-13 23:15:05,056 - src.utils.logger - INFO - Custom CSS loaded successfully
41
+ 2025-07-13 23:15:15,923 - src.utils.logger - INFO - Custom CSS loaded successfully
42
+ 2025-07-13 23:15:17,491 - src.utils.logger - INFO - Custom CSS loaded successfully
43
+ 2025-07-13 23:15:19,654 - src.utils.logger - INFO - Custom CSS loaded successfully
44
+ 2025-07-13 23:15:23,535 - src.utils.logger - INFO - Custom CSS loaded successfully
45
+ 2025-07-13 23:16:06,979 - src.utils.logger - INFO - Custom CSS loaded successfully
46
+ 2025-07-13 23:21:32,219 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
47
+ 2025-07-13 23:21:43,012 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
48
+ 2025-07-13 23:21:43,013 - openai._base_client - INFO - Retrying request to /embeddings in 0.396331 seconds
49
+ 2025-07-13 23:21:44,678 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
50
+ 2025-07-13 23:21:44,680 - openai._base_client - INFO - Retrying request to /embeddings in 0.842338 seconds
51
+ 2025-07-13 23:21:47,127 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
52
+ 2025-07-13 23:21:47,128 - src.utils.text_processor - WARNING - OpenAI embeddings failed: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
53
+ 2025-07-13 23:21:47,129 - src.utils.text_processor - INFO - Using simple text-based fallback
54
+ 2025-07-13 23:21:47,129 - src.utils.text_processor - INFO - Created simple text-based fallback vector store
55
+ 2025-07-13 23:21:47,129 - src.utils.text_processor - INFO - Using simple fallback QA system
56
+ 2025-07-13 23:22:46,498 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
57
+ 2025-07-13 23:22:49,535 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
58
+ 2025-07-13 23:53:47,078 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
59
+ 2025-07-13 23:53:52,909 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
60
+ 2025-07-13 23:53:59,444 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
61
+ 2025-07-13 23:53:59,609 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
62
+ 2025-07-13 23:54:00,519 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
63
+ 2025-07-13 23:54:07,685 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
64
+ 2025-07-13 23:54:07,688 - openai._base_client - INFO - Retrying request to /embeddings in 0.454709 seconds
65
+ 2025-07-13 23:54:08,673 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
66
+ 2025-07-13 23:54:08,674 - openai._base_client - INFO - Retrying request to /embeddings in 0.918276 seconds
67
+ 2025-07-13 23:54:10,652 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
68
+ 2025-07-13 23:54:10,656 - src.utils.text_processor - WARNING - OpenAI embeddings failed: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
69
+ 2025-07-13 23:54:10,657 - src.utils.text_processor - INFO - Using simple text-based fallback
70
+ 2025-07-13 23:54:10,659 - src.utils.text_processor - INFO - Created simple text-based fallback vector store
71
+ 2025-07-13 23:54:10,660 - src.utils.text_processor - INFO - Using simple fallback QA system
72
+ 2025-07-13 23:54:22,185 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
73
+ 2025-07-13 23:54:24,094 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
74
+ 2025-07-14 00:42:29,448 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
75
+ 2025-07-14 00:42:47,515 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
76
+ 2025-07-14 00:42:47,673 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
77
+ 2025-07-14 00:42:48,260 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
78
+ 2025-07-14 00:42:52,222 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
79
+ 2025-07-14 00:42:52,224 - openai._base_client - INFO - Retrying request to /embeddings in 0.396998 seconds
80
+ 2025-07-14 00:42:53,417 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
81
+ 2025-07-14 00:42:53,419 - openai._base_client - INFO - Retrying request to /embeddings in 0.829603 seconds
82
+ 2025-07-14 00:42:54,708 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
83
+ 2025-07-14 00:42:54,713 - src.utils.text_processor - WARNING - OpenAI embeddings failed: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
84
+ 2025-07-14 00:42:54,715 - src.utils.text_processor - INFO - Using simple text-based fallback
85
+ 2025-07-14 00:42:54,717 - src.utils.text_processor - INFO - Created simple text-based fallback vector store
86
+ 2025-07-14 00:42:54,718 - src.utils.text_processor - INFO - Using simple fallback QA system
87
+ 2025-07-14 00:50:26,573 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
88
+ 2025-07-14 01:00:15,758 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
89
+ 2025-07-14 01:00:23,869 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
90
+ 2025-07-14 01:00:24,021 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
91
+ 2025-07-14 01:00:24,480 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
92
+ 2025-07-14 01:00:26,491 - src.utils.youtube_handler - INFO - Successfully got transcript in en
93
+ 2025-07-14 01:00:28,434 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
94
+ 2025-07-14 01:00:28,436 - openai._base_client - INFO - Retrying request to /embeddings in 0.464677 seconds
95
+ 2025-07-14 01:00:29,888 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
96
+ 2025-07-14 01:00:29,889 - openai._base_client - INFO - Retrying request to /embeddings in 0.932156 seconds
97
+ 2025-07-14 01:00:31,765 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 429 Too Many Requests"
98
+ 2025-07-14 01:00:31,768 - src.utils.text_processor - WARNING - OpenAI embeddings failed: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
99
+ 2025-07-14 01:00:31,770 - src.utils.text_processor - INFO - Using simple text-based fallback
100
+ 2025-07-14 01:00:31,771 - src.utils.text_processor - INFO - Created simple text-based fallback vector store
101
+ 2025-07-14 01:00:31,772 - src.utils.text_processor - INFO - Using simple fallback QA system
102
+ 2025-07-14 01:01:09,650 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
103
+ 2025-07-14 01:01:32,106 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
104
+ 2025-07-14 01:01:32,361 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
105
+ 2025-07-14 01:01:33,814 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
106
+ 2025-07-14 01:01:33,972 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
107
+ 2025-07-14 01:01:34,156 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
108
+ 2025-07-14 01:01:34,479 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
109
+ 2025-07-14 01:01:34,750 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
110
+ 2025-07-14 01:01:34,892 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
111
+ 2025-07-14 01:01:35,077 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
112
+ 2025-07-14 01:01:35,399 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
113
+ 2025-07-14 01:01:38,181 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
114
+ 2025-07-14 01:01:38,429 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
115
+ 2025-07-14 01:01:39,552 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
116
+ 2025-07-14 01:01:39,800 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
117
+ 2025-07-14 01:01:39,822 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
118
+ 2025-07-14 01:01:39,989 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
119
+ 2025-07-14 01:01:40,370 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
120
+ 2025-07-14 01:01:41,960 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
121
+ 2025-07-14 01:14:18,892 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
122
+ 2025-07-14 01:14:53,458 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
123
+ 2025-07-14 01:14:53,729 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
124
+ 2025-07-14 01:14:54,391 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
125
+ 2025-07-14 01:14:54,504 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
126
+ 2025-07-14 01:14:54,695 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
127
+ 2025-07-14 01:14:54,820 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
128
+ 2025-07-14 01:14:55,091 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
129
+ 2025-07-14 01:14:55,861 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
130
+ 2025-07-14 01:14:55,967 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
131
+ 2025-07-14 01:14:56,154 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
132
+ 2025-07-14 01:14:56,488 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
133
+ 2025-07-14 01:14:56,756 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
134
+ 2025-07-14 01:14:56,902 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
135
+ 2025-07-14 01:14:57,250 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
136
+ 2025-07-14 01:14:58,213 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
137
+ 2025-07-14 01:14:58,299 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
138
+ 2025-07-14 01:14:58,452 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
139
+ 2025-07-14 01:14:58,584 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
140
+ 2025-07-14 01:14:58,780 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
141
+ 2025-07-14 01:14:59,089 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
142
+ 2025-07-14 01:14:59,926 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
143
+ 2025-07-14 01:15:00,032 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
144
+ 2025-07-14 01:15:00,208 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
145
+ 2025-07-14 01:15:00,533 - src.utils.logger - INFO - Custom dark theme CSS loaded successfully
src/logs/youtube_chatbot_20250714.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-07-14 00:58:58,526 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
2
+ 2025-07-14 00:59:00,597 - src.utils.youtube_handler - INFO - Successfully got transcript in en
3
+ 2025-07-14 00:59:00,784 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
4
+ 2025-07-14 00:59:02,824 - src.utils.youtube_handler - INFO - Successfully got transcript in en
5
+ 2025-07-14 00:59:03,041 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
6
+ 2025-07-14 00:59:05,007 - src.utils.youtube_handler - INFO - Successfully got transcript in en
7
+ 2025-07-14 00:59:05,209 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
8
+ 2025-07-14 00:59:07,507 - src.utils.youtube_handler - INFO - Successfully got transcript in en
9
+ 2025-07-14 01:12:20,696 - src.utils.youtube_handler - ERROR - Error getting video metadata: HTTP Error 400: Bad Request
10
+ 2025-07-14 01:12:20,697 - src.utils.youtube_handler - INFO - Rate limiting: sleeping for 2.69 seconds
11
+ 2025-07-14 01:12:24,675 - src.utils.youtube_handler - INFO - Rate limiting: sleeping for 1.72 seconds
12
+ 2025-07-14 01:12:27,701 - src.utils.youtube_handler - INFO - Successfully got transcript in en on attempt 1
src/src/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # YouTube Transcript Chatbot Package
2
+ __version__ = "1.0.0"
3
+ __author__ = "YouTube Transcript Chatbot Team"
src/src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (226 Bytes). View file
 
src/src/utils/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Utilities package
src/src/utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (160 Bytes). View file
 
src/src/utils/__pycache__/export_utils.cpython-310.pyc ADDED
Binary file (7.4 kB). View file
 
src/src/utils/__pycache__/logger.cpython-310.pyc ADDED
Binary file (1.38 kB). View file
 
src/src/utils/__pycache__/session_manager.cpython-310.pyc ADDED
Binary file (6.49 kB). View file
 
src/src/utils/__pycache__/text_processor.cpython-310.pyc ADDED
Binary file (12.7 kB). View file
 
src/src/utils/__pycache__/youtube_handler.cpython-310.pyc ADDED
Binary file (10.8 kB). View file
 
src/src/utils/cache_manager.py ADDED
@@ -0,0 +1,374 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Cache management utilities for vector stores and processed data.
3
+ """
4
+
5
+ import os
6
+ import pickle
7
+ import hashlib
8
+ import logging
9
+ from datetime import datetime, timedelta
10
+ from typing import Any, Optional, Dict
11
+ from pathlib import Path
12
+ import shutil
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
class CacheManager:
    """Manages caching of vector stores and processed data.

    On-disk layout (all under ``cache_dir``):
      - ``vectorstores/<md5(video_id)>``      FAISS index directory (``save_local`` output)
      - ``transcripts/<md5(video_id)>.pkl``   pickled transcript payload
      - ``metadata/<md5(video_id)>.pkl``      pickled bookkeeping record
    """

    def __init__(self, cache_dir: str = "cache", max_size_mb: int = 500):
        """
        Initialize cache manager.

        Args:
            cache_dir (str): Cache directory path
            max_size_mb (int): Maximum cache size in MB
        """
        self.cache_dir = Path(cache_dir)
        self.max_size_bytes = max_size_mb * 1024 * 1024
        self.ensure_cache_directory()

    def ensure_cache_directory(self):
        """Ensure cache directory exists."""
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Create subdirectories
        (self.cache_dir / "vectorstores").mkdir(exist_ok=True)
        (self.cache_dir / "transcripts").mkdir(exist_ok=True)
        (self.cache_dir / "metadata").mkdir(exist_ok=True)

    def _get_cache_key(self, data: str) -> str:
        """
        Generate cache key from data.

        MD5 is used here purely as a stable filename hash, not as a
        security measure.

        Args:
            data (str): Data to generate key for

        Returns:
            str: Cache key (hex digest)
        """
        return hashlib.md5(data.encode()).hexdigest()

    def _get_cache_path(self, cache_type: str, key: str) -> Path:
        """
        Get cache file path.

        Args:
            cache_type (str): Type of cache (vectorstores, transcripts, metadata)
            key (str): Cache key

        Returns:
            Path: Cache file path (``.pkl``; vector-store callers strip the
            suffix because FAISS writes a directory, not a single file)
        """
        return self.cache_dir / cache_type / f"{key}.pkl"

    def save_vectorstore(self, video_id: str, vectorstore: Any) -> bool:
        """
        Save vector store to cache.

        Args:
            video_id (str): Video ID
            vectorstore (Any): Vector store object

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            cache_key = self._get_cache_key(video_id)
            cache_path = self._get_cache_path("vectorstores", cache_key)

            # Save vector store using FAISS's built-in save method.
            # with_suffix("") drops ".pkl" because save_local creates a directory.
            vectorstore.save_local(str(cache_path.with_suffix("")))

            # Save metadata
            metadata = {
                'video_id': video_id,
                'created_at': datetime.now().isoformat(),
                'cache_key': cache_key
            }

            metadata_path = self._get_cache_path("metadata", cache_key)
            with open(metadata_path, 'wb') as f:
                pickle.dump(metadata, f)

            logger.info(f"Vector store cached for video {video_id}")
            self._cleanup_cache()
            return True

        except Exception as e:
            logger.error(f"Error caching vector store: {e}")
            return False

    def load_vectorstore(self, video_id: str, embeddings: Any) -> Optional[Any]:
        """
        Load vector store from cache.

        Args:
            video_id (str): Video ID
            embeddings (Any): Embeddings object for loading

        Returns:
            Optional[Any]: Vector store object or None if not found
        """
        try:
            cache_key = self._get_cache_key(video_id)
            cache_path = self._get_cache_path("vectorstores", cache_key)

            if not cache_path.with_suffix("").exists():
                return None

            # Load vector store using FAISS's built-in load method.
            # NOTE(review): recent langchain_community versions refuse to
            # deserialize a pickled FAISS index unless
            # allow_dangerous_deserialization=True is passed — confirm against
            # the pinned langchain_community version.
            from langchain_community.vectorstores import FAISS
            vectorstore = FAISS.load_local(str(cache_path.with_suffix("")), embeddings)

            logger.info(f"Vector store loaded from cache for video {video_id}")
            return vectorstore

        except Exception as e:
            logger.error(f"Error loading vector store from cache: {e}")
            return None

    def save_transcript(self, video_id: str, transcript_data: Dict[str, Any]) -> bool:
        """
        Save transcript data to cache.

        Args:
            video_id (str): Video ID
            transcript_data (Dict[str, Any]): Transcript data

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            cache_key = self._get_cache_key(video_id)
            cache_path = self._get_cache_path("transcripts", cache_key)

            # Wrap the payload with bookkeeping fields before pickling.
            cache_data = {
                'video_id': video_id,
                'transcript_data': transcript_data,
                'created_at': datetime.now().isoformat(),
                'cache_key': cache_key
            }

            with open(cache_path, 'wb') as f:
                pickle.dump(cache_data, f)

            logger.info(f"Transcript cached for video {video_id}")
            self._cleanup_cache()
            return True

        except Exception as e:
            logger.error(f"Error caching transcript: {e}")
            return False

    def load_transcript(self, video_id: str) -> Optional[Dict[str, Any]]:
        """
        Load transcript data from cache.

        Args:
            video_id (str): Video ID

        Returns:
            Optional[Dict[str, Any]]: Transcript data or None if not found
        """
        try:
            cache_key = self._get_cache_key(video_id)
            cache_path = self._get_cache_path("transcripts", cache_key)

            if not cache_path.exists():
                return None

            with open(cache_path, 'rb') as f:
                cache_data = pickle.load(f)

            logger.info(f"Transcript loaded from cache for video {video_id}")
            # Unwrap: only the caller-supplied payload is returned.
            return cache_data['transcript_data']

        except Exception as e:
            logger.error(f"Error loading transcript from cache: {e}")
            return None

    def is_cached(self, video_id: str, cache_type: str = "vectorstores") -> bool:
        """
        Check if data is cached for video.

        Args:
            video_id (str): Video ID
            cache_type (str): Type of cache to check

        Returns:
            bool: True if cached, False otherwise
        """
        try:
            cache_key = self._get_cache_key(video_id)

            # Vector stores live in a suffix-less directory; everything else
            # is a single .pkl file.
            if cache_type == "vectorstores":
                cache_path = self._get_cache_path("vectorstores", cache_key)
                return cache_path.with_suffix("").exists()
            else:
                cache_path = self._get_cache_path(cache_type, cache_key)
                return cache_path.exists()

        except Exception as e:
            logger.error(f"Error checking cache: {e}")
            return False

    def delete_cache(self, video_id: str) -> bool:
        """
        Delete cached data for video.

        Args:
            video_id (str): Video ID

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            cache_key = self._get_cache_key(video_id)

            # Delete vector store cache (a directory, hence rmtree)
            vectorstore_path = self._get_cache_path("vectorstores", cache_key)
            if vectorstore_path.with_suffix("").exists():
                shutil.rmtree(vectorstore_path.with_suffix(""))

            # Delete transcript cache
            transcript_path = self._get_cache_path("transcripts", cache_key)
            if transcript_path.exists():
                transcript_path.unlink()

            # Delete metadata cache
            metadata_path = self._get_cache_path("metadata", cache_key)
            if metadata_path.exists():
                metadata_path.unlink()

            logger.info(f"Cache deleted for video {video_id}")
            return True

        except Exception as e:
            logger.error(f"Error deleting cache: {e}")
            return False

    def get_cache_size(self) -> Dict[str, Any]:
        """
        Get cache size information.

        Returns:
            Dict[str, Any]: Cache size information (empty dict on error)
        """
        try:
            total_size = 0
            file_count = 0

            # Walk the whole cache tree, including FAISS index directories.
            for root, dirs, files in os.walk(self.cache_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    if os.path.exists(file_path):
                        total_size += os.path.getsize(file_path)
                        file_count += 1

            return {
                'total_size_bytes': total_size,
                'total_size_mb': round(total_size / (1024 * 1024), 2),
                'file_count': file_count,
                'max_size_mb': self.max_size_bytes / (1024 * 1024),
                'usage_percent': round((total_size / self.max_size_bytes) * 100, 2)
            }

        except Exception as e:
            logger.error(f"Error getting cache size: {e}")
            return {}

    def _cleanup_cache(self):
        """Clean up cache if it exceeds maximum size (evicts oldest files
        until usage drops to 80% of the limit).

        NOTE(review): eviction removes individual files, so it may delete
        part of a FAISS index directory and leave the rest unreadable —
        confirm whether whole-entry eviction is required.
        """
        try:
            cache_info = self.get_cache_size()

            if cache_info.get('total_size_bytes', 0) > self.max_size_bytes:
                logger.info("Cache size exceeded, cleaning up...")

                # Get all cache files with their modification times
                cache_files = []
                for root, dirs, files in os.walk(self.cache_dir):
                    for file in files:
                        file_path = os.path.join(root, file)
                        if os.path.exists(file_path):
                            mtime = os.path.getmtime(file_path)
                            cache_files.append((file_path, mtime))

                # Sort by modification time (oldest first)
                cache_files.sort(key=lambda x: x[1])

                # Delete oldest files until under limit
                current_size = cache_info.get('total_size_bytes', 0)
                target_size = self.max_size_bytes * 0.8  # Clean to 80% of max

                for file_path, _ in cache_files:
                    if current_size <= target_size:
                        break

                    try:
                        file_size = os.path.getsize(file_path)
                        os.remove(file_path)
                        current_size -= file_size
                        logger.debug(f"Deleted cache file: {file_path}")
                    except Exception as e:
                        logger.error(f"Error deleting cache file {file_path}: {e}")

                logger.info("Cache cleanup completed")

        except Exception as e:
            logger.error(f"Error during cache cleanup: {e}")

    def clear_all_cache(self) -> bool:
        """
        Clear all cached data.

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            # Remove the whole tree, then recreate the empty skeleton so
            # subsequent saves don't fail on missing directories.
            if self.cache_dir.exists():
                shutil.rmtree(self.cache_dir)
            self.ensure_cache_directory()

            logger.info("All cache cleared")
            return True

        except Exception as e:
            logger.error(f"Error clearing cache: {e}")
            return False

    def get_cached_videos(self) -> List[Dict[str, Any]]:
        """
        Get list of cached videos.

        Returns:
            List[Dict[str, Any]]: List of cached video information
        """
        try:
            cached_videos = []
            metadata_dir = self.cache_dir / "metadata"

            if not metadata_dir.exists():
                return cached_videos

            for metadata_file in metadata_dir.glob("*.pkl"):
                try:
                    with open(metadata_file, 'rb') as f:
                        metadata = pickle.load(f)

                    cached_videos.append({
                        'video_id': metadata.get('video_id'),
                        'cache_key': metadata.get('cache_key'),
                        'created_at': metadata.get('created_at'),
                        # Size of the metadata record only, not of the cached
                        # vector store / transcript.
                        'file_size': metadata_file.stat().st_size
                    })

                except Exception as e:
                    # Skip unreadable/corrupt metadata entries, keep going.
                    logger.error(f"Error reading metadata file {metadata_file}: {e}")

            return cached_videos

        except Exception as e:
            logger.error(f"Error getting cached videos: {e}")
            return []
src/src/utils/database.py ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Database utilities for storing processed videos and conversations.
3
+ """
4
+
5
+ import sqlite3
6
+ import json
7
+ import logging
8
+ from datetime import datetime
9
+ from typing import Dict, List, Any, Optional
10
+ from pathlib import Path
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
class DatabaseManager:
    """Manages SQLite database operations for the chatbot.

    Each method opens a short-lived connection with
    ``sqlite3.connect(self.db_path)``; no connection is held between calls.

    NOTE(review): sqlite3 does not enforce FOREIGN KEY clauses unless
    ``PRAGMA foreign_keys = ON`` is executed per connection — the declared
    constraints below are currently documentation only; confirm intent.
    """

    def __init__(self, db_path: str = "data/chatbot.db"):
        """
        Initialize database manager.

        Args:
            db_path (str): Path to SQLite database file
        """
        self.db_path = db_path
        self.ensure_db_directory()
        self.init_database()

    def ensure_db_directory(self):
        """Ensure database directory exists."""
        db_dir = Path(self.db_path).parent
        db_dir.mkdir(parents=True, exist_ok=True)

    def init_database(self):
        """Initialize database tables (idempotent: CREATE TABLE IF NOT EXISTS).

        Raises:
            Exception: re-raised on any failure, since the app cannot run
            without its schema.
        """
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()

                # Videos table
                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS videos (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        video_id TEXT UNIQUE NOT NULL,
                        url TEXT NOT NULL,
                        title TEXT,
                        author TEXT,
                        duration INTEGER,
                        views INTEGER,
                        publish_date TEXT,
                        thumbnail_url TEXT,
                        transcript TEXT,
                        metadata TEXT,
                        processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        language TEXT DEFAULT 'en'
                    )
                ''')

                # Conversations table
                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS conversations (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        conversation_id TEXT NOT NULL,
                        video_id TEXT,
                        question TEXT NOT NULL,
                        answer TEXT NOT NULL,
                        source_documents TEXT,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        FOREIGN KEY (video_id) REFERENCES videos (video_id)
                    )
                ''')

                # Vector stores table (for caching)
                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS vector_stores (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        video_id TEXT UNIQUE NOT NULL,
                        vector_data BLOB,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        FOREIGN KEY (video_id) REFERENCES videos (video_id)
                    )
                ''')

                # User sessions table
                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS user_sessions (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        session_id TEXT UNIQUE NOT NULL,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        last_activity TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                        metadata TEXT
                    )
                ''')

                conn.commit()
                logger.info("Database initialized successfully")

        except Exception as e:
            logger.error(f"Error initializing database: {e}")
            raise

    def save_video(self, video_data: Dict[str, Any]) -> bool:
        """
        Save video information to database.

        Uses INSERT OR REPLACE, so re-processing a video overwrites the
        existing row (video_id is UNIQUE).

        Args:
            video_data (Dict[str, Any]): Video data including metadata and transcript

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()

                cursor.execute('''
                    INSERT OR REPLACE INTO videos 
                    (video_id, url, title, author, duration, views, publish_date, 
                     thumbnail_url, transcript, metadata, language)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    video_data.get('video_id'),
                    video_data.get('url'),
                    video_data.get('title'),
                    video_data.get('author'),
                    video_data.get('duration'),
                    video_data.get('views'),
                    video_data.get('publish_date'),
                    video_data.get('thumbnail_url'),
                    video_data.get('transcript'),
                    # nested metadata dict is serialized to a JSON string column
                    json.dumps(video_data.get('metadata', {})),
                    video_data.get('language', 'en')
                ))

                conn.commit()
                return True

        except Exception as e:
            logger.error(f"Error saving video: {e}")
            return False

    def get_video(self, video_id: str) -> Optional[Dict[str, Any]]:
        """
        Get video information from database.

        Args:
            video_id (str): Video ID

        Returns:
            Optional[Dict[str, Any]]: Video data or None if not found
        """
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()

                cursor.execute('''
                    SELECT video_id, url, title, author, duration, views, 
                           publish_date, thumbnail_url, transcript, metadata, 
                           processed_at, language
                    FROM videos WHERE video_id = ?
                ''', (video_id,))

                row = cursor.fetchone()
                if row:
                    # Positional indexing must stay in sync with the SELECT
                    # column order above.
                    return {
                        'video_id': row[0],
                        'url': row[1],
                        'title': row[2],
                        'author': row[3],
                        'duration': row[4],
                        'views': row[5],
                        'publish_date': row[6],
                        'thumbnail_url': row[7],
                        'transcript': row[8],
                        'metadata': json.loads(row[9]) if row[9] else {},
                        'processed_at': row[10],
                        'language': row[11]
                    }

        except Exception as e:
            logger.error(f"Error getting video: {e}")

        return None

    def save_conversation(self, conversation_data: Dict[str, Any]) -> bool:
        """
        Save conversation entry to database.

        Args:
            conversation_data (Dict[str, Any]): Conversation data

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()

                cursor.execute('''
                    INSERT INTO conversations 
                    (conversation_id, video_id, question, answer, source_documents)
                    VALUES (?, ?, ?, ?, ?)
                ''', (
                    conversation_data.get('conversation_id'),
                    conversation_data.get('video_id'),
                    conversation_data.get('question'),
                    conversation_data.get('answer'),
                    json.dumps(conversation_data.get('source_documents', []))
                ))

                conn.commit()
                return True

        except Exception as e:
            logger.error(f"Error saving conversation: {e}")
            return False

    def get_conversations(self, video_id: Optional[str] = None, conversation_id: Optional[str] = None,
                         limit: int = 100) -> List[Dict[str, Any]]:
        """
        Get conversations from database, newest first.

        Args:
            video_id (Optional[str]): Optional video ID filter
            conversation_id (Optional[str]): Optional conversation ID filter
            limit (int): Maximum number of results

        Returns:
            List[Dict[str, Any]]: List of conversations (empty list on error)
        """
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()

                # Build the WHERE clause dynamically from whichever filters
                # were supplied; values always go through placeholders.
                query = '''
                    SELECT conversation_id, video_id, question, answer, 
                           source_documents, created_at
                    FROM conversations
                '''
                params = []

                conditions = []
                if video_id:
                    conditions.append('video_id = ?')
                    params.append(video_id)

                if conversation_id:
                    conditions.append('conversation_id = ?')
                    params.append(conversation_id)

                if conditions:
                    query += ' WHERE ' + ' AND '.join(conditions)

                query += ' ORDER BY created_at DESC LIMIT ?'
                params.append(limit)

                cursor.execute(query, params)
                rows = cursor.fetchall()

                conversations = []
                for row in rows:
                    conversations.append({
                        'conversation_id': row[0],
                        'video_id': row[1],
                        'question': row[2],
                        'answer': row[3],
                        'source_documents': json.loads(row[4]) if row[4] else [],
                        'created_at': row[5]
                    })

                return conversations

        except Exception as e:
            logger.error(f"Error getting conversations: {e}")
            return []

    def get_processed_videos(self, limit: int = 50) -> List[Dict[str, Any]]:
        """
        Get list of processed videos, most recently processed first.

        Args:
            limit (int): Maximum number of results

        Returns:
            List[Dict[str, Any]]: List of processed videos (empty list on error)
        """
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()

                cursor.execute('''
                    SELECT video_id, title, author, duration, processed_at
                    FROM videos
                    ORDER BY processed_at DESC
                    LIMIT ?
                ''', (limit,))

                rows = cursor.fetchall()

                videos = []
                for row in rows:
                    videos.append({
                        'video_id': row[0],
                        'title': row[1],
                        'author': row[2],
                        'duration': row[3],
                        'processed_at': row[4]
                    })

                return videos

        except Exception as e:
            logger.error(f"Error getting processed videos: {e}")
            return []

    def delete_video(self, video_id: str) -> bool:
        """
        Delete video and associated conversations.

        Args:
            video_id (str): Video ID to delete

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()

                # Delete conversations first (foreign key constraint)
                cursor.execute('DELETE FROM conversations WHERE video_id = ?', (video_id,))
                cursor.execute('DELETE FROM vector_stores WHERE video_id = ?', (video_id,))
                cursor.execute('DELETE FROM videos WHERE video_id = ?', (video_id,))

                conn.commit()
                return True

        except Exception as e:
            logger.error(f"Error deleting video: {e}")
            return False

    def get_database_stats(self) -> Dict[str, Any]:
        """
        Get database statistics.

        Returns:
            Dict[str, Any]: Database statistics (empty dict on error)
        """
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()

                # Count videos
                cursor.execute('SELECT COUNT(*) FROM videos')
                video_count = cursor.fetchone()[0]

                # Count conversations
                cursor.execute('SELECT COUNT(*) FROM conversations')
                conversation_count = cursor.fetchone()[0]

                # Get database size via the page-count/page-size pragmas
                cursor.execute("SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()")
                db_size = cursor.fetchone()[0]

                return {
                    'total_videos': video_count,
                    'total_conversations': conversation_count,
                    'database_size_bytes': db_size,
                    'database_size_mb': round(db_size / (1024 * 1024), 2)
                }

        except Exception as e:
            logger.error(f"Error getting database stats: {e}")
            return {}
src/src/utils/export_utils.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Export utilities for generating PDF, text, and other format exports.
3
+ """
4
+
5
+ import io
6
+ import json
7
+ import logging
8
+ from datetime import datetime
9
+ from typing import List, Dict, Any, Optional
10
+ from reportlab.lib.pagesizes import letter, A4
11
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
12
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
13
+ from reportlab.lib.units import inch
14
+ from reportlab.lib.colors import HexColor
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
class ExportUtils:
    """Utilities for exporting chat history and transcripts in various formats.

    Chat-history entries are expected to be dicts with at least the keys
    ``question``, ``answer`` and ``timestamp`` (ISO-8601 string) — see the
    export methods below. PDF output relies on reportlab.
    """

    def __init__(self):
        self.styles = getSampleStyleSheet()
        self.setup_custom_styles()

    def setup_custom_styles(self):
        """Setup custom styles for PDF generation (title, question, answer)."""
        self.styles.add(ParagraphStyle(
            name='CustomTitle',
            parent=self.styles['Heading1'],
            fontSize=16,
            spaceAfter=30,
            textColor=HexColor('#2E86AB')
        ))

        self.styles.add(ParagraphStyle(
            name='QuestionStyle',
            parent=self.styles['Normal'],
            fontSize=12,
            spaceAfter=10,
            textColor=HexColor('#A23B72'),
            leftIndent=20
        ))

        self.styles.add(ParagraphStyle(
            name='AnswerStyle',
            parent=self.styles['Normal'],
            fontSize=11,
            spaceAfter=20,
            leftIndent=40
        ))

    def export_to_pdf(self, chat_history: List[Dict[str, Any]],
                     video_metadata: Optional[Dict[str, Any]] = None) -> bytes:
        """
        Export chat history to PDF format.

        Args:
            chat_history (List[Dict[str, Any]]): Chat history entries
            video_metadata (Optional[Dict[str, Any]]): Video metadata

        Returns:
            bytes: PDF content as bytes (empty bytes on error)
        """
        try:
            buffer = io.BytesIO()
            doc = SimpleDocTemplate(buffer, pagesize=A4)
            story = []

            # Title
            title = "YouTube Transcript Q&A Session"
            story.append(Paragraph(title, self.styles['CustomTitle']))
            story.append(Spacer(1, 12))

            # Video information
            if video_metadata:
                story.append(Paragraph("Video Information", self.styles['Heading2']))
                story.append(Paragraph(f"<b>Title:</b> {video_metadata.get('title', 'N/A')}", 
                                     self.styles['Normal']))
                story.append(Paragraph(f"<b>Author:</b> {video_metadata.get('author', 'N/A')}", 
                                     self.styles['Normal']))
                story.append(Paragraph(f"<b>Duration:</b> {self._format_duration(video_metadata.get('length', 0))}", 
                                     self.styles['Normal']))
                story.append(Spacer(1, 20))

            # Export information
            story.append(Paragraph(f"<b>Exported on:</b> {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", 
                                 self.styles['Normal']))
            story.append(Paragraph(f"<b>Total Questions:</b> {len(chat_history)}", 
                                 self.styles['Normal']))
            story.append(Spacer(1, 20))

            # Chat history
            story.append(Paragraph("Questions and Answers", self.styles['Heading2']))
            story.append(Spacer(1, 12))

            for i, entry in enumerate(chat_history, 1):
                # Question
                story.append(Paragraph(f"<b>Q{i}:</b> {entry['question']}", 
                                     self.styles['QuestionStyle']))

                # Answer
                story.append(Paragraph(f"<b>A{i}:</b> {entry['answer']}", 
                                     self.styles['AnswerStyle']))

                # Timestamp — assumes entry['timestamp'] is an ISO-8601 string
                # (as produced by datetime.isoformat()); fromisoformat raises
                # on anything else.
                timestamp = datetime.fromisoformat(entry['timestamp']).strftime('%Y-%m-%d %H:%M:%S')
                story.append(Paragraph(f"<i>Asked on: {timestamp}</i>", 
                                     self.styles['Normal']))
                story.append(Spacer(1, 15))

            doc.build(story)
            buffer.seek(0)
            return buffer.getvalue()

        except Exception as e:
            logger.error(f"Error generating PDF: {e}")
            return b""

    def export_to_text(self, chat_history: List[Dict[str, Any]],
                      video_metadata: Optional[Dict[str, Any]] = None) -> str:
        """
        Export chat history to plain text format.

        Args:
            chat_history (List[Dict[str, Any]]): Chat history entries
            video_metadata (Optional[Dict[str, Any]]): Video metadata

        Returns:
            str: Text content (empty string on error)
        """
        try:
            lines = []
            lines.append("YouTube Transcript Q&A Session")
            lines.append("=" * 50)
            lines.append("")

            # Video information
            if video_metadata:
                lines.append("Video Information:")
                lines.append(f"Title: {video_metadata.get('title', 'N/A')}")
                lines.append(f"Author: {video_metadata.get('author', 'N/A')}")
                lines.append(f"Duration: {self._format_duration(video_metadata.get('length', 0))}")
                lines.append("")

            # Export information
            lines.append(f"Exported on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
            lines.append(f"Total Questions: {len(chat_history)}")
            lines.append("")
            lines.append("Questions and Answers:")
            lines.append("-" * 30)
            lines.append("")

            # Chat history
            for i, entry in enumerate(chat_history, 1):
                # Same ISO-8601 timestamp assumption as export_to_pdf.
                timestamp = datetime.fromisoformat(entry['timestamp']).strftime('%Y-%m-%d %H:%M:%S')
                lines.append(f"Q{i}: {entry['question']}")
                lines.append(f"A{i}: {entry['answer']}")
                lines.append(f"Asked on: {timestamp}")
                lines.append("")
                lines.append("-" * 30)
                lines.append("")

            return "\n".join(lines)

        except Exception as e:
            logger.error(f"Error generating text export: {e}")
            return ""

    def export_to_json(self, chat_history: List[Dict[str, Any]],
                      video_metadata: Optional[Dict[str, Any]] = None) -> str:
        """
        Export chat history to JSON format.

        Args:
            chat_history (List[Dict[str, Any]]): Chat history entries
            video_metadata (Optional[Dict[str, Any]]): Video metadata

        Returns:
            str: JSON content (empty string on error)
        """
        try:
            export_data = {
                'export_info': {
                    'exported_at': datetime.now().isoformat(),
                    'total_questions': len(chat_history),
                    'format_version': '1.0'
                },
                'video_metadata': video_metadata or {},
                'chat_history': chat_history
            }

            # ensure_ascii=False keeps non-Latin transcript text readable.
            return json.dumps(export_data, indent=2, ensure_ascii=False)

        except Exception as e:
            logger.error(f"Error generating JSON export: {e}")
            return ""

    def export_transcript(self, transcript_text: str, video_metadata: Optional[Dict[str, Any]] = None,
                         format: str = 'txt') -> str:
        """
        Export transcript in specified format.

        NOTE: the ``format`` parameter shadows the builtin of the same name;
        it is kept because callers may pass it by keyword. Unknown formats
        fall through and return the raw transcript text.

        Args:
            transcript_text (str): Transcript text
            video_metadata (Optional[Dict[str, Any]]): Video metadata
            format (str): Export format ('txt', 'json')

        Returns:
            str: Exported transcript (empty string on error)
        """
        try:
            if format == 'txt':
                lines = []
                lines.append("YouTube Video Transcript")
                lines.append("=" * 30)
                lines.append("")

                if video_metadata:
                    lines.append(f"Title: {video_metadata.get('title', 'N/A')}")
                    lines.append(f"Author: {video_metadata.get('author', 'N/A')}")
                    lines.append(f"Duration: {self._format_duration(video_metadata.get('length', 0))}")
                    lines.append("")

                lines.append(f"Exported on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
                lines.append("")
                lines.append("Transcript:")
                lines.append("-" * 20)
                lines.append("")
                lines.append(transcript_text)

                return "\n".join(lines)

            elif format == 'json':
                export_data = {
                    'export_info': {
                        'exported_at': datetime.now().isoformat(),
                        'format_version': '1.0'
                    },
                    'video_metadata': video_metadata or {},
                    'transcript': transcript_text
                }
                return json.dumps(export_data, indent=2, ensure_ascii=False)

            return transcript_text

        except Exception as e:
            logger.error(f"Error exporting transcript: {e}")
            return ""

    def _format_duration(self, seconds: int) -> str:
        """Format duration from seconds to HH:MM:SS format.

        Returns "N/A" for falsy input (0, None). Assumes an integer number
        of seconds — the :02d format specifiers reject floats.
        """
        if not seconds:
            return "N/A"

        hours = seconds // 3600
        minutes = (seconds % 3600) // 60
        seconds = seconds % 60  # rebinds the parameter to the remainder

        if hours > 0:
            return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
        else:
            return f"{minutes:02d}:{seconds:02d}"
src/src/utils/logger.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Logging configuration and utilities.
3
+ """
4
+
5
+ import logging
6
+ import os
7
+ from datetime import datetime
8
+ from logging.handlers import RotatingFileHandler
9
+
10
def setup_logging(log_level: str = "INFO", log_file: str = None) -> logging.Logger:
    """
    Setup logging configuration (console + rotating file handler).

    Args:
        log_level (str): Logging level name (DEBUG, INFO, WARNING, ERROR,
            CRITICAL). Unknown names fall back to INFO instead of raising.
        log_file (str): Optional log file path. When omitted, a dated file
            under ``logs/`` is used.

    Returns:
        logging.Logger: Logger for this module, attached to the configured
        root logger.
    """
    if log_file:
        # Only create a parent directory when the path actually has one:
        # os.makedirs("") raises FileNotFoundError for bare filenames
        # like "app.log".
        log_dir = os.path.dirname(log_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
    else:
        os.makedirs('logs', exist_ok=True)
        log_file = f'logs/youtube_chatbot_{datetime.now().strftime("%Y%m%d")}.log'

    # Configure logging. force=True makes repeated calls reconfigure the
    # root logger; without it basicConfig is a silent no-op once any
    # handler is installed.
    logging.basicConfig(
        level=getattr(logging, log_level.upper(), logging.INFO),
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(),  # Console output
            RotatingFileHandler(
                log_file,
                maxBytes=10*1024*1024,  # 10MB
                backupCount=5
            )
        ],
        force=True,
    )

    return logging.getLogger(__name__)
43
+
44
def get_logger(name: str) -> logging.Logger:
    """Fetch (or lazily create) the logger registered under ``name``."""
    named_logger = logging.getLogger(name)
    return named_logger
src/src/utils/session_manager.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Session management utilities for handling chat history and application state.
3
+ """
4
+
5
+ import json
6
+ import logging
7
+ from datetime import datetime
8
+ from typing import Dict, List, Any, Optional
9
+ import streamlit as st
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
class SessionManager:
    """Manages session state, chat history, and conversation persistence.

    All state is kept in Streamlit's ``st.session_state`` so it survives
    script reruns within a single browser session.
    """

    # Maximum characters of each source document kept in a history entry.
    _SOURCE_PREVIEW_CHARS = 200

    def __init__(self):
        self.initialize_session_state()

    def initialize_session_state(self):
        """Initialize Streamlit session state variables (idempotent)."""
        # Seed each key only when absent so reruns never clobber live state.
        defaults = {
            'chat_history': [],
            'processed_videos': {},
            'current_video': None,
            'qa_chain': None,
            'vectorstore': None,
            'video_metadata': {},
        }
        for key, default in defaults.items():
            if key not in st.session_state:
                st.session_state[key] = default

        if 'conversation_id' not in st.session_state:
            st.session_state.conversation_id = self.generate_conversation_id()

    def generate_conversation_id(self) -> str:
        """Generate a unique conversation ID from the current timestamp."""
        return f"conv_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    @staticmethod
    def _preview(doc: Any) -> str:
        """Return a truncated text preview of a source document.

        Tolerates objects without a ``page_content`` attribute (the original
        code crashed on those) by falling back to ``str(doc)``.
        """
        content = getattr(doc, 'page_content', None)
        if content is None:
            content = str(doc)
        if len(content) > SessionManager._SOURCE_PREVIEW_CHARS:
            return content[:SessionManager._SOURCE_PREVIEW_CHARS] + "..."
        return content

    def add_to_chat_history(self, question: str, answer: str, video_id: Optional[str] = None,
                           source_docs: Optional[List[Any]] = None):
        """
        Add a Q&A pair to chat history.

        Args:
            question (str): User question
            answer (str): AI answer
            video_id (Optional[str]): Associated video ID
            source_docs (Optional[List[Any]]): Source documents used for answer
        """
        chat_entry = {
            'timestamp': datetime.now().isoformat(),
            'question': question,
            'answer': answer,
            'video_id': video_id,
            # Store only short previews; full documents can be large.
            'source_docs': [self._preview(doc) for doc in (source_docs or [])],
            'conversation_id': st.session_state.conversation_id
        }

        st.session_state.chat_history.append(chat_entry)

    def get_chat_history(self, video_id: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Get chat history, optionally filtered by video ID.

        Args:
            video_id (Optional[str]): Optional video ID to filter by

        Returns:
            List[Dict[str, Any]]: Chat history entries (the live list when
            unfiltered — callers must not mutate it)
        """
        if video_id:
            return [entry for entry in st.session_state.chat_history
                    if entry.get('video_id') == video_id]
        return st.session_state.chat_history

    def clear_chat_history(self, video_id: Optional[str] = None):
        """
        Clear chat history, optionally for a specific video.

        Args:
            video_id (Optional[str]): Optional video ID to clear history for
        """
        if video_id:
            # Keep everything that does NOT belong to the given video.
            st.session_state.chat_history = [
                entry for entry in st.session_state.chat_history
                if entry.get('video_id') != video_id
            ]
        else:
            st.session_state.chat_history = []

    def save_processed_video(self, video_url: str, video_id: str, metadata: Dict[str, Any],
                            transcript: str, qa_chain: Any, vectorstore: Any):
        """
        Save processed video information to session state and make it current.

        Args:
            video_url (str): Video URL
            video_id (str): Video ID
            metadata (Dict[str, Any]): Video metadata
            transcript (str): Video transcript
            qa_chain (Any): QA chain object
            vectorstore (Any): Vector store object
        """
        st.session_state.processed_videos[video_id] = {
            'url': video_url,
            'metadata': metadata,
            'transcript': transcript,
            'processed_at': datetime.now().isoformat(),
            'conversation_id': st.session_state.conversation_id
        }

        # The newly processed video becomes the active one; qa_chain and
        # vectorstore are session-scoped (not persisted per video).
        st.session_state.current_video = video_id
        st.session_state.qa_chain = qa_chain
        st.session_state.vectorstore = vectorstore
        st.session_state.video_metadata = metadata

    def get_processed_videos(self) -> Dict[str, Dict[str, Any]]:
        """Get all processed videos keyed by video ID."""
        return st.session_state.processed_videos

    def switch_to_video(self, video_id: str) -> bool:
        """
        Switch to a previously processed video.

        Args:
            video_id (str): Video ID to switch to

        Returns:
            bool: True if successful, False if video not found
        """
        if video_id in st.session_state.processed_videos:
            st.session_state.current_video = video_id
            # Note: QA chain and vectorstore would need to be recreated
            # This is a simplified version - in a full implementation,
            # you'd want to persist and reload these objects
            return True
        return False

    def export_chat_history(self, format: str = 'json') -> str:
        """
        Export chat history in specified format.

        Args:
            format (str): Export format ('json', 'txt').  (Name shadows the
                builtin but is kept for caller compatibility.)

        Returns:
            str: Exported chat history, or '' for an unknown format
        """
        if format == 'json':
            return json.dumps(st.session_state.chat_history, indent=2)

        elif format == 'txt':
            output = []
            for entry in st.session_state.chat_history:
                output.append(f"Timestamp: {entry['timestamp']}")
                output.append(f"Question: {entry['question']}")
                output.append(f"Answer: {entry['answer']}")
                if entry.get('video_id'):
                    output.append(f"Video ID: {entry['video_id']}")
                output.append("-" * 50)
            return "\n".join(output)

        return ""

    def get_session_stats(self) -> Dict[str, Any]:
        """Get session statistics (counts, current video, session start time)."""
        return {
            'total_questions': len(st.session_state.chat_history),
            'processed_videos': len(st.session_state.processed_videos),
            'current_video': st.session_state.current_video,
            'conversation_id': st.session_state.conversation_id,
            # Earliest entry timestamp; falls back to "now" for an empty history.
            'session_start': min([entry['timestamp'] for entry in st.session_state.chat_history],
                                default=datetime.now().isoformat())
        }
src/src/utils/text_processor.py ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text processing utilities for document handling and vector store operations.
3
+ """
4
+
5
+ import os
6
+ import logging
7
+ from typing import List, Optional, Dict, Any
8
+ try:
9
+ from langchain_openai import OpenAIEmbeddings, OpenAI
10
+ except ImportError:
11
+ from langchain_community.embeddings import OpenAIEmbeddings
12
+ from langchain_community.llms import OpenAI
13
+ from langchain_community.vectorstores import FAISS
14
+ from langchain_community.document_loaders import TextLoader
15
+ from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
16
+ from langchain.chains import RetrievalQA
17
+ from langchain.docstore.document import Document
18
+ import pickle
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
class SimpleVectorStore:
    """
    Lightweight stand-in for a vector store that needs no embeddings.

    Relevance is determined later by plain keyword matching rather than
    vector similarity, so this works entirely offline.
    """

    def __init__(self, documents: List[Document]):
        self.documents = documents
        # Keep the raw chunk texts around for quick inspection/search.
        self.texts = []
        for document in documents:
            self.texts.append(document.page_content)

    def as_retriever(self, search_type: str = "similarity", search_kwargs: dict = None):
        """Build and return a keyword-based retriever over the stored documents."""
        kwargs = search_kwargs if search_kwargs is not None else {}
        return SimpleRetriever(self.documents, kwargs)
35
+
36
class SimpleRetriever:
    """Keyword-overlap retriever over an in-memory list of documents."""

    def __init__(self, documents: List[Document], search_kwargs: dict):
        self.documents = documents
        # Number of documents to return per query (defaults to 4).
        self.k = search_kwargs.get('k', 4)

    def get_relevant_documents(self, query: str) -> List[Document]:
        """Rank documents by how many query words they contain; return the top k."""
        words = query.lower().split()

        ranked = []
        for candidate in self.documents:
            haystack = candidate.page_content.lower()
            hits = 0
            for word in words:
                if word in haystack:
                    hits += 1
            if hits:
                ranked.append((candidate, hits))

        # Highest-scoring first; ties keep original document order (stable sort).
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return [pair[0] for pair in ranked[:self.k]]
57
+
58
class FallbackQAChain:
    """
    Fallback QA chain that works without OpenAI API.

    Mimics the RetrievalQA call contract: invoked with ``{'query': ...}``
    and returning ``{'result': ..., 'source_documents': [...]}``, so callers
    need no special-casing. Answers come from keyword matching over the
    transcript chunks rather than an LLM.
    """

    def __init__(self, vectorstore):
        """
        Args:
            vectorstore: A SimpleVectorStore, a FAISS store, or any object
                exposing its documents via ``docstore._dict``.
        """
        self.vectorstore = vectorstore
        self.documents = []

        # Best-effort extraction of the underlying documents. The duck-typed
        # FAISS-style ``docstore._dict`` path is checked first so any store
        # exposing it works; the two cases are mutually exclusive for the
        # known store types, so the order swap does not change behavior.
        try:
            if hasattr(vectorstore, 'docstore') and hasattr(vectorstore.docstore, '_dict'):
                # NOTE(review): ``docstore._dict`` is a FAISS internal and may
                # change across langchain versions — confirm on upgrade.
                self.documents = list(vectorstore.docstore._dict.values())
            elif isinstance(vectorstore, SimpleVectorStore):
                self.documents = vectorstore.documents
        except Exception as e:
            # Was a bare ``except: pass``, which also swallowed
            # KeyboardInterrupt/SystemExit and hid failures. Narrowed and
            # logged; an empty document list just yields "couldn't find"
            # answers, so this remains best-effort.
            logger.debug(f"Could not extract documents from vectorstore: {e}")

    def __call__(self, inputs: Dict[str, str]) -> Dict[str, Any]:
        """
        Process a query and return an answer.

        Args:
            inputs (Dict[str, str]): Input dictionary with 'query' key

        Returns:
            Dict[str, Any]: Result dictionary with 'result' and 'source_documents'
        """
        query = inputs.get('query', '').lower()

        # Simple keyword-based search: score each chunk by how many query
        # words appear in it (substring match, case-insensitive).
        relevant_docs = []
        for doc in self.documents:
            if hasattr(doc, 'page_content'):
                content = doc.page_content.lower()
                query_words = query.split()
                matches = sum(1 for word in query_words if word in content)
                if matches > 0:
                    relevant_docs.append((doc, matches))

        # Sort by relevance and take top results
        relevant_docs.sort(key=lambda x: x[1], reverse=True)
        top_docs = [doc for doc, _ in relevant_docs[:3]]

        # Generate simple answer by stitching together chunk excerpts.
        if top_docs:
            combined_text = " ".join([doc.page_content[:200] for doc in top_docs])
            answer = f"Based on the transcript, here's what I found: {combined_text[:500]}..."
        else:
            answer = "I couldn't find specific information about that in the transcript. Please try rephrasing your question or ask about different topics covered in the video."

        return {
            'result': answer,
            'source_documents': top_docs
        }
116
+
117
class TextProcessor:
    """Handles text processing, document splitting, and vector store operations.

    Degrades gracefully: when OpenAI embeddings or the LLM are unavailable,
    it falls back to keyword search (SimpleVectorStore / FallbackQAChain),
    so the app still answers questions — just without semantic retrieval.
    """

    def __init__(self, openai_api_key: str):
        """
        Initialize TextProcessor with OpenAI API key.

        Args:
            openai_api_key (str): OpenAI API key
        """
        self.openai_api_key = openai_api_key
        # Clients are built eagerly; actual API calls (and any auth/quota
        # failures) only happen later, inside the try/except fallbacks.
        self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
        self.llm = OpenAI(openai_api_key=openai_api_key, temperature=0.7)

    def create_documents_from_text(self, text: str, metadata: Dict[str, Any] = None) -> List[Document]:
        """
        Create LangChain documents from text with metadata.

        Args:
            text (str): Input text
            metadata (Dict[str, Any]): Document metadata, copied onto every chunk

        Returns:
            List[Document]: Chunked LangChain documents (~1000 chars each,
            200-char overlap between neighbors)
        """
        if metadata is None:
            metadata = {}

        # Use RecursiveCharacterTextSplitter for better text splitting:
        # it prefers paragraph, then line, then word boundaries before
        # falling back to hard character cuts.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )

        # Create a document and split it
        doc = Document(page_content=text, metadata=metadata)
        docs = text_splitter.split_documents([doc])

        return docs

    def create_vector_store(self, documents: List[Document]) -> Optional[FAISS]:
        """
        Create FAISS vector store from documents with fallback options.

        Args:
            documents (List[Document]): List of documents

        Returns:
            Optional[FAISS]: FAISS vector store, a SimpleVectorStore fallback
            when embeddings fail, or None if everything failed
        """
        try:
            if not documents:
                logger.error("No documents provided for vector store creation")
                return None

            # Try with OpenAI embeddings first
            try:
                vectorstore = FAISS.from_documents(documents, self.embeddings)
                return vectorstore
            except Exception as openai_error:
                # Broad catch is deliberate: any embeddings failure (auth,
                # quota, network) should degrade to keyword search, not crash.
                logger.warning(f"OpenAI embeddings failed: {openai_error}")

                # Fallback to simple text-based search
                logger.info("Using simple text-based fallback")
                return self._create_simple_fallback_store(documents)

        except Exception as e:
            logger.error(f"Error creating vector store: {e}")
            return None

    def _create_simple_fallback_store(self, documents: List[Document]) -> Optional['SimpleVectorStore']:
        """
        Create a simple fallback vector store using basic text search.

        Args:
            documents (List[Document]): List of documents

        Returns:
            Optional[SimpleVectorStore]: Simple vector store or None if failed
        """
        try:
            # Create simple text-based vector store
            simple_store = SimpleVectorStore(documents)
            logger.info("Created simple text-based fallback vector store")
            return simple_store
        except Exception as e:
            logger.error(f"Even fallback vector store creation failed: {e}")
            return None

    def save_vector_store(self, vectorstore: FAISS, path: str) -> bool:
        """
        Save vector store to disk.

        Args:
            vectorstore (FAISS): Vector store to save (FAISS only — the
                SimpleVectorStore fallback has no save_local)
            path (str): Path to save the vector store

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            # Create the parent directory if the path has one ('.' otherwise).
            os.makedirs(os.path.dirname(path) if os.path.dirname(path) else '.', exist_ok=True)
            vectorstore.save_local(path)
            return True
        except Exception as e:
            logger.error(f"Error saving vector store: {e}")
            return False

    def load_vector_store(self, path: str) -> Optional[FAISS]:
        """
        Load vector store from disk.

        Args:
            path (str): Path to load the vector store from

        Returns:
            Optional[FAISS]: Loaded vector store or None if failed
        """
        try:
            if not os.path.exists(path):
                logger.error(f"Vector store path does not exist: {path}")
                return None

            # NOTE(review): newer langchain versions require
            # allow_dangerous_deserialization=True here — confirm against the
            # pinned langchain-community version.
            vectorstore = FAISS.load_local(path, self.embeddings)
            return vectorstore
        except Exception as e:
            logger.error(f"Error loading vector store: {e}")
            return None

    def create_qa_chain(self, vectorstore, chain_type: str = "stuff") -> Optional[RetrievalQA]:
        """
        Create QA chain from vector store with fallback options.

        Args:
            vectorstore: Vector store (FAISS or SimpleVectorStore)
            chain_type (str): Type of chain to create

        Returns:
            Optional[RetrievalQA]: QA chain, a FallbackQAChain when the LLM is
            unavailable, or None if failed
        """
        try:
            # Check if it's a simple vector store (fallback mode) — the LLM
            # path needs real embeddings, so go straight to keyword QA.
            if isinstance(vectorstore, SimpleVectorStore):
                logger.info("Using simple fallback QA system")
                return FallbackQAChain(vectorstore)

            retriever = vectorstore.as_retriever(
                search_type="similarity",
                search_kwargs={"k": 4}
            )

            # Try with OpenAI LLM first
            try:
                qa_chain = RetrievalQA.from_chain_type(
                    llm=self.llm,
                    chain_type=chain_type,
                    retriever=retriever,
                    return_source_documents=True
                )
                return qa_chain
            except Exception as openai_error:
                logger.warning(f"OpenAI LLM failed: {openai_error}")

                # Fallback to a simple text-based QA system
                logger.info("Creating fallback QA system")
                return FallbackQAChain(vectorstore)

        except Exception as e:
            logger.error(f"Error creating QA chain: {e}")
            return None

    def _create_fallback_qa_chain(self, vectorstore: FAISS):
        """
        Create a fallback QA chain that works without OpenAI API.

        (Thin wrapper kept for API symmetry; create_qa_chain constructs
        FallbackQAChain directly.)

        Args:
            vectorstore (FAISS): Vector store

        Returns:
            FallbackQAChain: Simple QA chain
        """
        return FallbackQAChain(vectorstore)

    def process_transcript(self, transcript_text: str, metadata: Dict[str, Any] = None) -> Dict[str, Any]:
        """
        Process transcript text and create QA chain.

        Pipeline: split text into chunks -> build vector store (with
        fallback) -> build QA chain (with fallback).

        Args:
            transcript_text (str): Transcript text
            metadata (Dict[str, Any]): Video metadata

        Returns:
            Dict[str, Any]: Processing result with 'success', 'qa_chain',
            'vectorstore', 'documents' and 'error' keys
        """
        result = {
            'success': False,
            'qa_chain': None,
            'vectorstore': None,
            'documents': None,
            'error': None
        }

        try:
            # Create documents from transcript
            documents = self.create_documents_from_text(transcript_text, metadata)
            if not documents:
                result['error'] = "Failed to create documents from transcript"
                return result

            # Create vector store
            vectorstore = self.create_vector_store(documents)
            if not vectorstore:
                result['error'] = "Failed to create vector store"
                return result

            # Create QA chain
            qa_chain = self.create_qa_chain(vectorstore)
            if not qa_chain:
                result['error'] = "Failed to create QA chain"
                return result

            result['success'] = True
            result['qa_chain'] = qa_chain
            result['vectorstore'] = vectorstore
            result['documents'] = documents

        except Exception as e:
            result['error'] = f"Error processing transcript: {str(e)}"
            logger.error(f"Error processing transcript: {e}")

        return result

    def ask_question(self, qa_chain: RetrievalQA, question: str) -> Dict[str, Any]:
        """
        Ask a question using the QA chain.

        Args:
            qa_chain (RetrievalQA): QA chain (or FallbackQAChain — both are
                callable with {'query': ...})
            question (str): Question to ask

        Returns:
            Dict[str, Any]: Answer and source documents, plus 'success'/'error'
        """
        try:
            # NOTE(review): calling the chain directly is deprecated in newer
            # langchain in favor of .invoke() — confirm installed version.
            result = qa_chain({"query": question})
            return {
                'success': True,
                'answer': result['result'],
                'source_documents': result.get('source_documents', []),
                'error': None
            }
        except Exception as e:
            logger.error(f"Error asking question: {e}")
            return {
                'success': False,
                'answer': None,
                'source_documents': [],
                'error': f"Error processing question: {str(e)}"
            }
src/src/utils/youtube_handler.py ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ YouTube video handling utilities for transcript extraction and metadata retrieval.
3
+ """
4
+
5
+ import os
6
+ import re
7
+ import logging
8
+ import time
9
+ import random
10
+ from typing import Optional, Dict, Any, List
11
+ from pytube import YouTube
12
+ from youtube_transcript_api import (
13
+ YouTubeTranscriptApi,
14
+ TranscriptsDisabled,
15
+ NoTranscriptFound,
16
+ VideoUnavailable,
17
+ CouldNotRetrieveTranscript
18
+ )
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
class YouTubeHandler:
    """Handles YouTube video operations including transcript extraction and metadata retrieval.

    Includes client-side rate limiting and exponential backoff because
    YouTube aggressively blocks rapid or cloud-provider requests.
    """

    def __init__(self):
        self.supported_languages = ['en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh']
        # Rate limiting to prevent IP blocking
        self.last_request_time = 0          # epoch seconds of the last outbound request
        self.min_request_interval = 3.0     # Minimum 3 seconds between requests
        self.max_retries = 3                # attempts per transcript strategy
        self.base_delay = 2.0               # base for exponential backoff (seconds)

    def _rate_limit(self) -> None:
        """Implement rate limiting to prevent IP blocking.

        Sleeps just long enough to keep at least ``min_request_interval``
        seconds between consecutive outbound requests.
        """
        current_time = time.time()
        time_since_last_request = current_time - self.last_request_time

        if time_since_last_request < self.min_request_interval:
            sleep_time = self.min_request_interval - time_since_last_request
            logger.info(f"Rate limiting: sleeping for {sleep_time:.2f} seconds")
            time.sleep(sleep_time)

        self.last_request_time = time.time()

    def _exponential_backoff(self, attempt: int) -> None:
        """Implement exponential backoff for retries.

        Delay is base_delay * 2**attempt plus up to 1s of jitter to avoid
        synchronized retry bursts.
        """
        delay = self.base_delay * (2 ** attempt) + random.uniform(0, 1)
        logger.info(f"Exponential backoff: attempt {attempt + 1}, sleeping for {delay:.2f} seconds")
        time.sleep(delay)

    def validate_youtube_url(self, url: str) -> bool:
        """
        Validate if the provided URL is a valid YouTube URL.

        Args:
            url (str): YouTube URL to validate

        Returns:
            bool: True if valid, False otherwise
        """
        # Matches youtube.com / youtu.be / youtube-nocookie.com watch, embed,
        # and short-link forms; the final group is the 11-character video ID.
        youtube_regex = re.compile(
            r'(https?://)?(www\.)?(youtube|youtu|youtube-nocookie)\.(com|be)/'
            r'(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})'
        )
        return bool(youtube_regex.match(url))

    def extract_video_id(self, url: str) -> Optional[str]:
        """
        Extract video ID from YouTube URL.

        Delegates to pytube, which parses the URL locally.

        Args:
            url (str): YouTube URL

        Returns:
            Optional[str]: Video ID if found, None otherwise
        """
        try:
            yt = YouTube(url)
            return yt.video_id
        except Exception as e:
            logger.error(f"Error extracting video ID: {e}")
            return None

    def get_video_metadata(self, url: str) -> Dict[str, Any]:
        """
        Get video metadata including title, description, duration, etc.

        Args:
            url (str): YouTube URL

        Returns:
            Dict[str, Any]: Video metadata; empty dict on any failure so
            callers can proceed without metadata
        """
        try:
            # Accessing these pytube properties triggers network fetches.
            yt = YouTube(url)
            metadata = {
                'title': yt.title,
                'description': yt.description,
                'length': yt.length,
                'views': yt.views,
                # 'rating' was removed in newer pytube versions; tolerate both.
                'rating': getattr(yt, 'rating', None),
                'author': yt.author,
                'publish_date': yt.publish_date,
                'thumbnail_url': yt.thumbnail_url,
                'video_id': yt.video_id
            }
            return metadata
        except Exception as e:
            logger.error(f"Error getting video metadata: {e}")
            return {}

    def get_available_transcripts(self, video_id: str) -> List[Dict[str, str]]:
        """
        Get list of available transcript languages for a video.

        Args:
            video_id (str): YouTube video ID

        Returns:
            List[Dict[str, str]]: List of available transcripts with language
            info; empty list on failure
        """
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
            available = []

            for transcript in transcript_list:
                available.append({
                    'language': transcript.language,
                    'language_code': transcript.language_code,
                    'is_generated': transcript.is_generated,
                    'is_translatable': transcript.is_translatable
                })

            return available
        except Exception as e:
            logger.error(f"Error getting available transcripts: {e}")
            return []

    def get_youtube_transcript(self, url: str, language: str = 'en') -> Dict[str, Any]:
        """
        Extract transcript from YouTube video with comprehensive error handling and rate limiting.

        Tries three strategies in order, each with up to ``max_retries``
        attempts and exponential backoff:
          1. find_transcript() over a prioritized language list
          2. any listed transcript (manual preferred over auto-generated)
          3. bare get_transcript() as a last resort

        Args:
            url (str): YouTube video URL
            language (str): Preferred language code (default: 'en')

        Returns:
            Dict[str, Any]: Dictionary containing transcript text and metadata;
            on failure 'error' is set and may be accompanied by 'suggestion'
            and 'details' for user-facing guidance
        """
        result = {
            'success': False,
            'transcript': '',
            'error': None,
            'metadata': {},
            'available_languages': []
        }

        try:
            if not self.validate_youtube_url(url):
                result['error'] = "Invalid YouTube URL format"
                return result

            video_id = self.extract_video_id(url)
            if not video_id:
                result['error'] = "Could not extract video ID from URL"
                return result

            # Apply rate limiting before making requests
            self._rate_limit()

            # Get video metadata
            result['metadata'] = self.get_video_metadata(url)

            # Apply rate limiting before transcript requests
            self._rate_limit()

            # Get available transcripts
            result['available_languages'] = self.get_available_transcripts(video_id)

            # Try to get transcript with multiple strategies and retries
            transcript_data = None
            used_language = None

            # Strategy 1: Try the standard approach with retries
            for attempt in range(self.max_retries):
                try:
                    if attempt > 0:
                        self._exponential_backoff(attempt - 1)

                    self._rate_limit()
                    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

                    # Try preferred language first, then fallback to English, then any available
                    # (The comprehension is built before extend() applies, so it
                    # dedups only against the preferred/'en' entries.)
                    languages_to_try = [language] if language != 'en' else []
                    languages_to_try.extend(['en'])
                    languages_to_try.extend([lang['language_code'] for lang in result['available_languages']
                                             if lang['language_code'] not in languages_to_try])

                    for lang in languages_to_try:
                        try:
                            transcript = transcript_list.find_transcript([lang])
                            transcript_data = transcript.fetch()
                            used_language = lang
                            logger.info(f"Successfully got transcript in {lang} on attempt {attempt + 1}")
                            break
                        except (NoTranscriptFound, TranscriptsDisabled):
                            # Missing language is expected — try the next one.
                            continue

                    if transcript_data:
                        break

                except Exception as e:
                    logger.warning(f"Standard transcript method failed on attempt {attempt + 1}: {e}")
                    if attempt == self.max_retries - 1:
                        logger.error(f"All {self.max_retries} attempts failed for standard method")

            # Strategy 2: Try alternative approach if first failed
            if not transcript_data:
                for attempt in range(self.max_retries):
                    try:
                        if attempt > 0:
                            self._exponential_backoff(attempt - 1)

                        self._rate_limit()
                        # Try to get any available transcript without language preference
                        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
                        available_transcripts = list(transcript_list)

                        if available_transcripts:
                            # Try manual transcripts first (higher quality than auto-generated)
                            manual_transcripts = [t for t in available_transcripts if not t.is_generated]
                            if manual_transcripts:
                                transcript = manual_transcripts[0]
                            else:
                                transcript = available_transcripts[0]

                            transcript_data = transcript.fetch()
                            used_language = transcript.language_code
                            logger.info(f"Got transcript using alternative method in {used_language} on attempt {attempt + 1}")
                            break

                    except Exception as e:
                        logger.warning(f"Alternative transcript method failed on attempt {attempt + 1}: {e}")
                        if attempt == self.max_retries - 1:
                            logger.error(f"All {self.max_retries} attempts failed for alternative method")

            # Strategy 3: Try basic method as last resort
            if not transcript_data:
                for attempt in range(self.max_retries):
                    try:
                        if attempt > 0:
                            self._exponential_backoff(attempt - 1)

                        self._rate_limit()
                        # This is a last resort - try with minimal parameters
                        transcript_data = YouTubeTranscriptApi.get_transcript(video_id)
                        used_language = 'auto-detected'
                        logger.info(f"Got transcript using basic method on attempt {attempt + 1}")
                        break
                    except Exception as e:
                        logger.warning(f"Basic transcript method failed on attempt {attempt + 1}: {e}")
                        if attempt == self.max_retries - 1:
                            logger.error(f"All {self.max_retries} attempts failed for basic method")

            if transcript_data:
                # Format transcript text - handle both dict and object formats
                # (youtube-transcript-api changed its snippet type across versions).
                text_parts = []
                formatted_transcript_data = []

                for item in transcript_data:
                    if hasattr(item, 'text'):
                        # New format: object with attributes
                        text_parts.append(item.text)
                        formatted_transcript_data.append({
                            'text': item.text,
                            'start': getattr(item, 'start', 0),
                            'duration': getattr(item, 'duration', 0)
                        })
                    elif isinstance(item, dict) and 'text' in item:
                        # Old format: dictionary
                        text_parts.append(item['text'])
                        formatted_transcript_data.append(item)
                    else:
                        # Fallback: convert to string
                        text_parts.append(str(item))
                        formatted_transcript_data.append({'text': str(item), 'start': 0, 'duration': 0})

                text = " ".join(text_parts)
                result['transcript'] = text
                result['success'] = True
                result['used_language'] = used_language
                result['transcript_data'] = formatted_transcript_data  # Raw transcript with timestamps
            else:
                result['error'] = "No transcript available in any supported language"

        except TranscriptsDisabled:
            result['error'] = "Transcripts are disabled for this video"
        except NoTranscriptFound:
            result['error'] = "No transcript found for this video"
        except VideoUnavailable:
            result['error'] = "This video is unavailable"
        except CouldNotRetrieveTranscript as e:
            # Classify the failure from the exception message to give the user
            # actionable guidance (IP block vs region lock vs private video...).
            error_msg = str(e).lower()
            if "ip" in error_msg and "block" in error_msg:
                result['error'] = "IP blocked by YouTube: Too many requests from your IP address"
                result['suggestion'] = "Wait 10-15 minutes before trying again, or try a different network"
                result['details'] = "YouTube has temporarily blocked your IP due to too many requests. This is common when testing or using cloud services."
            elif "region" in error_msg or "country" in error_msg:
                result['error'] = "Regional restriction: This video's transcripts are not available in your region"
                result['suggestion'] = "Try using a VPN or try a different video"
            elif "private" in error_msg:
                result['error'] = "This video is private and transcripts cannot be accessed"
            elif "disabled" in error_msg:
                result['error'] = "Transcripts are disabled for this video"
            elif "cloud provider" in error_msg:
                result['error'] = "Cloud provider IP blocked: YouTube blocks most cloud service IPs"
                result['suggestion'] = "Try from a different network or wait before retrying"
                result['details'] = "YouTube automatically blocks IPs from cloud providers like AWS, Google Cloud, etc."
            else:
                result['error'] = f"Could not retrieve transcript: {str(e)}"
            logger.warning(f"Could not retrieve transcript for video: {e}")
        except Exception as e:
            # Same message-sniffing classification for unexpected exception types.
            error_msg = str(e).lower()
            if "ip" in error_msg and ("block" in error_msg or "ban" in error_msg):
                result['error'] = "IP blocked by YouTube: Too many requests from your IP address"
                result['suggestion'] = "Wait 10-15 minutes before trying again, or try a different network"
                result['details'] = "YouTube has temporarily blocked your IP due to too many requests. This is common when testing or using cloud services."
            elif "cloud provider" in error_msg or "aws" in error_msg or "google cloud" in error_msg or "azure" in error_msg:
                result['error'] = "Cloud provider IP blocked: YouTube blocks most cloud service IPs"
                result['suggestion'] = "Try from a different network or wait before retrying"
                result['details'] = "YouTube automatically blocks IPs from cloud providers like AWS, Google Cloud, etc."
            elif "region" in error_msg or "country" in error_msg:
                result['error'] = "Regional restriction: This video's transcripts are not available in your region"
                result['suggestion'] = "Try using a VPN or try a different video"
            elif "private" in error_msg:
                result['error'] = "This video is private and transcripts cannot be accessed"
            elif "unavailable" in error_msg:
                result['error'] = "This video is unavailable or has been removed"
            elif "disabled" in error_msg:
                result['error'] = "Transcripts are disabled for this video"
            elif "too many requests" in error_msg:
                result['error'] = "Rate limited: Too many requests to YouTube"
                result['suggestion'] = "Wait a few minutes before trying again"
                result['details'] = "You've made too many requests to YouTube. Please wait before trying again."
            else:
                result['error'] = f"Unexpected error: {str(e)}"
            logger.error(f"Unexpected error getting transcript: {e}")

        return result

    def save_transcript_to_file(self, transcript_text: str, filename: str = "transcript.txt") -> bool:
        """
        Save transcript text to a file.

        Args:
            transcript_text (str): Transcript text to save
            filename (str): Output filename

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            # Create the parent directory if the filename has one ('.' otherwise).
            os.makedirs(os.path.dirname(filename) if os.path.dirname(filename) else '.', exist_ok=True)
            with open(filename, "w", encoding="utf-8") as f:
                f.write(transcript_text)
            return True
        except Exception as e:
            logger.error(f"Error saving transcript to file: {e}")
            return False
src/static/style.css ADDED
@@ -0,0 +1,501 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* YouTube Transcript Chatbot - Custom Styles - Dark Theme */
/* Palette used throughout: #1a1a1a page background, #2d3748 panel
   background, #4a5568 borders, #e9ecef primary text, #a0aec0 muted text,
   #667eea/#764ba2 brand gradient. `!important` is used heavily to win
   specificity battles against Streamlit's inline/theme styles. */

/* Global dark theme styling */
* {
    box-sizing: border-box;
}

body, html {
    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Helvetica Neue', Arial, sans-serif;
    line-height: 1.6;
    color: #e9ecef !important;
    background-color: #1a1a1a !important;
}

/* Ensure all text elements have proper contrast for dark theme */
p, span, div, label, h1, h2, h3, h4, h5, h6 {
    color: #e9ecef !important;
    text-rendering: optimizeLegibility;
    -webkit-font-smoothing: antialiased;
    -moz-osx-font-smoothing: grayscale;
}

/* Main container styling */
.main-container {
    max-width: 1200px;
    margin: 0 auto;
    padding: 20px;
    background-color: #1a1a1a !important;
}

/* Header styling */
.app-header {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 2rem;
    border-radius: 10px;
    margin-bottom: 2rem;
    text-align: center;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}

.app-header h1 {
    margin: 0;
    font-size: 2.5rem;
    font-weight: 700;
}

.app-header p {
    margin: 0.5rem 0 0 0;
    font-size: 1.1rem;
    opacity: 0.9;
}

/* Card styling - Dark Theme.
   Variant cards below override only background/border/text colors and
   inherit the layout from .info-card. */
.info-card {
    background: #2d3748 !important;
    border-radius: 10px;
    padding: 1.5rem;
    margin: 1rem 0;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.3);
    border-left: 4px solid #667eea;
    color: #e9ecef !important;
}

.success-card {
    background: #1a2e1a !important;
    border-color: #28a745;
    color: #90ee90 !important;
}

.error-card {
    background: #2e1a1a !important;
    border-color: #dc3545;
    color: #ffb3b3 !important;
}

.warning-card {
    background: #2e2a1a !important;
    border-color: #ffc107;
    color: #ffe066 !important;
}

/* Video metadata styling - Dark Theme */
.video-metadata {
    background: #2d3748 !important;
    border-radius: 8px;
    padding: 1rem;
    margin: 1rem 0;
    border: 1px solid #4a5568;
}

.video-metadata h4 {
    color: #e9ecef !important;
    margin-bottom: 0.5rem;
}

.metadata-item {
    display: flex;
    justify-content: space-between;
    padding: 0.25rem 0;
    border-bottom: 1px solid #4a5568;
}

.metadata-item:last-child {
    border-bottom: none;
}

.metadata-label {
    font-weight: 600;
    color: #a0aec0 !important;
}

.metadata-value {
    color: #e9ecef !important;
}

/* Chat history styling - Dark Theme */
.chat-container {
    max-height: 400px;
    overflow-y: auto;
    border: 1px solid #4a5568;
    border-radius: 8px;
    padding: 1rem;
    background: #2d3748 !important;
}

.chat-message {
    margin-bottom: 1rem;
    padding: 0.75rem;
    border-radius: 8px;
}

.chat-question {
    background: #1a365d !important;
    border-left: 4px solid #3182ce;
    color: #e9ecef !important;
}

/* Answers are indented relative to questions (margin-left). */
.chat-answer {
    background: #322659 !important;
    border-left: 4px solid #9f7aea;
    margin-left: 1rem;
    color: #e9ecef !important;
}

.chat-timestamp {
    font-size: 0.8rem;
    color: #a0aec0 !important;
    margin-top: 0.5rem;
}

/* Button styling */
.custom-button {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    border: none;
    padding: 0.75rem 1.5rem;
    border-radius: 6px;
    font-weight: 600;
    cursor: pointer;
    transition: all 0.3s ease;
}

.custom-button:hover {
    transform: translateY(-2px);
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
}

.secondary-button {
    background: #6c757d;
}

.success-button {
    background: #28a745;
}

.danger-button {
    background: #dc3545;
}

/* Loading animation */
.loading-spinner {
    display: inline-block;
    width: 20px;
    height: 20px;
    border: 3px solid #f3f3f3;
    border-top: 3px solid #667eea;
    border-radius: 50%;
    animation: spin 1s linear infinite;
}

@keyframes spin {
    0% { transform: rotate(0deg); }
    100% { transform: rotate(360deg); }
}

/* Progress bar */
.progress-bar {
    width: 100%;
    height: 6px;
    background: #e9ecef;
    border-radius: 3px;
    overflow: hidden;
    margin: 1rem 0;
}

.progress-fill {
    height: 100%;
    background: linear-gradient(90deg, #667eea, #764ba2);
    border-radius: 3px;
    transition: width 0.3s ease;
}

/* Sidebar styling - Dark Theme */
.sidebar-content {
    background: #2d3748 !important;
    padding: 1rem;
    border-radius: 8px;
    margin-bottom: 1rem;
    border: 1px solid #4a5568;
}

.sidebar-section {
    margin-bottom: 1.5rem;
}

.sidebar-section h4 {
    color: #e9ecef !important;
    margin-bottom: 0.5rem;
    font-size: 1.1rem;
}

/* Form styling */
.form-group {
    margin-bottom: 1rem;
}

/* NOTE(review): #495057 / #ced4da below look like leftover light-theme
   values and render low-contrast on the #1a1a1a background — confirm
   whether .form-label/.form-input are still used or should be darkened. */
.form-label {
    display: block;
    margin-bottom: 0.5rem;
    font-weight: 600;
    color: #495057;
}

.form-input {
    width: 100%;
    padding: 0.75rem;
    border: 1px solid #ced4da;
    border-radius: 6px;
    font-size: 1rem;
    transition: border-color 0.3s ease;
}

.form-input:focus {
    outline: none;
    border-color: #667eea;
    box-shadow: 0 0 0 2px rgba(102, 126, 234, 0.25);
}

/* Responsive design */
@media (max-width: 768px) {
    .app-header h1 {
        font-size: 2rem;
    }

    .main-container {
        padding: 10px;
    }

    .info-card {
        padding: 1rem;
    }

    .chat-answer {
        margin-left: 0.5rem;
    }
}

/* Streamlit specific overrides - Dark Theme */

/* Main app background */
.stApp {
    background-color: #1a1a1a !important;
    color: #e9ecef !important;
}

.stApp > div {
    background-color: #1a1a1a !important;
}

/* Main content area */
.main .block-container {
    background-color: #1a1a1a !important;
    color: #e9ecef !important;
}

/* Sidebar styling.
   NOTE(review): css-1d391kg / css-1lcbmhc / css-1aumxhk are hashed,
   Streamlit-version-specific class names; they may silently stop matching
   after a Streamlit upgrade. Prefer data-testid selectors where possible. */
.css-1d391kg, .css-1lcbmhc, .css-1aumxhk {
    background-color: #2d3748 !important;
    color: #e9ecef !important;
}

/* Button styling */
.stButton > button {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    color: white !important;
    border: none !important;
    border-radius: 6px !important;
    font-weight: 600 !important;
    transition: all 0.3s ease !important;
}

.stButton > button:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.4) !important;
    color: white !important;
}

/* Input fields */
.stSelectbox > div > div {
    border-radius: 6px !important;
    background-color: #2d3748 !important;
    color: #e9ecef !important;
    border: 1px solid #4a5568 !important;
}

.stSelectbox label {
    color: #e9ecef !important;
}

.stTextInput > div > div > input {
    border-radius: 6px !important;
    background-color: #2d3748 !important;
    color: #e9ecef !important;
    border: 1px solid #4a5568 !important;
}

.stTextInput label {
    color: #e9ecef !important;
}

.stTextArea > div > div > textarea {
    border-radius: 6px !important;
    background-color: #2d3748 !important;
    color: #e9ecef !important;
    border: 1px solid #4a5568 !important;
}

.stTextArea label {
    color: #e9ecef !important;
}

/* Success/Error message styling - Dark Theme */
.stSuccess {
    background: #1a2e1a !important;
    border: 1px solid #28a745 !important;
    border-radius: 6px !important;
    color: #90ee90 !important;
}

.stSuccess p {
    color: #90ee90 !important;
}

.stError {
    background: #2e1a1a !important;
    border: 1px solid #dc3545 !important;
    border-radius: 6px !important;
    color: #ffb3b3 !important;
}

.stError p {
    color: #ffb3b3 !important;
}

.stWarning {
    background: #2e2a1a !important;
    border: 1px solid #ffc107 !important;
    border-radius: 6px !important;
    color: #ffe066 !important;
}

.stWarning p {
    color: #ffe066 !important;
}

.stInfo {
    background: #1a2a2e !important;
    border: 1px solid #17a2b8 !important;
    border-radius: 6px !important;
    color: #66d9ef !important;
}

.stInfo p {
    color: #66d9ef !important;
}

/* Additional dark theme overrides */
.stMarkdown {
    color: #e9ecef !important;
}

.stMarkdown p {
    color: #e9ecef !important;
}

.stMarkdown h1, .stMarkdown h2, .stMarkdown h3, .stMarkdown h4, .stMarkdown h5, .stMarkdown h6 {
    color: #e9ecef !important;
}

/* Expander styling */
.streamlit-expanderHeader {
    background-color: #2d3748 !important;
    color: #e9ecef !important;
    border: 1px solid #4a5568 !important;
}

.streamlit-expanderContent {
    background-color: #2d3748 !important;
    color: #e9ecef !important;
    border: 1px solid #4a5568 !important;
}

/* Metric styling */
.metric-container {
    background-color: #2d3748 !important;
    color: #e9ecef !important;
}

/* Code block styling */
.stCode {
    background-color: #2d3748 !important;
    color: #e9ecef !important;
    border: 1px solid #4a5568 !important;
}

/* DataFrame styling */
.stDataFrame {
    background-color: #2d3748 !important;
    color: #e9ecef !important;
}

/* JSON display styling */
.stJson {
    background-color: #2d3748 !important;
    color: #e9ecef !important;
}

/* Spinner styling */
.stSpinner {
    color: #667eea !important;
}

/* Progress bar styling */
.stProgress .st-bo {
    background-color: #667eea !important;
}

/* Custom classes for dark theme */
.visible-text {
    color: #e9ecef !important;
    background-color: #2d3748 !important;
    padding: 0.5rem !important;
    border-radius: 4px !important;
    border: 1px solid #4a5568 !important;
}

.high-contrast-text {
    color: #ffffff !important;
    background-color: #000000 !important;
    font-weight: 600 !important;
    padding: 0.5rem !important;
    border-radius: 4px !important;
}

/* Override any remaining white backgrounds */
div[data-testid="stSidebar"] {
    background-color: #2d3748 !important;
}

div[data-testid="stSidebar"] > div {
    background-color: #2d3748 !important;
}

/* NOTE(review): hashed class names again — version-specific (see above). */
.css-1lcbmhc {
    background-color: #2d3748 !important;
}

.css-1d391kg {
    background-color: #1a1a1a !important;
}

/* Force dark theme on all containers */
.element-container {
    background-color: transparent !important;
    color: #e9ecef !important;
}

.stAlert {
    color: #e9ecef !important;
}
src/tests/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Tests package
src/tests/test_session_manager.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Tests for session manager functionality.
"""

import unittest
from unittest.mock import patch, MagicMock
import sys
import os
from datetime import datetime

# Make the repository root importable so `from src.utils...` resolves.
# This file lives in src/tests/, so the root is TWO levels up; the previous
# path ('..', 'src') pointed at the nonexistent src/src/ directory.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))

# Mock streamlit before importing session_manager, so importing the module
# does not require a running Streamlit session.
sys.modules['streamlit'] = MagicMock()

from src.utils.session_manager import SessionManager
18
+
19
class TestSessionManager(unittest.TestCase):
    """Test cases for SessionManager class.

    Each test patches the `st` (streamlit) reference inside
    src.utils.session_manager with a MagicMock whose `session_state` is a
    plain dict, so SessionManager's reads/writes can be inspected directly.
    """

    def setUp(self):
        """Set up test fixtures."""
        # Mock streamlit session_state
        self.mock_st = MagicMock()
        self.mock_st.session_state = {}

        with patch('src.utils.session_manager.st', self.mock_st):
            self.session_manager = SessionManager()

    def test_initialization(self):
        """Test SessionManager initialization."""
        with patch('src.utils.session_manager.st', self.mock_st):
            manager = SessionManager()

        # Check that session state variables are initialized
        expected_keys = [
            'chat_history', 'processed_videos', 'current_video',
            'qa_chain', 'vectorstore', 'video_metadata', 'conversation_id'
        ]

        for key in expected_keys:
            self.assertIn(key, self.mock_st.session_state)

    def test_generate_conversation_id(self):
        """Test conversation ID generation."""
        conv_id = self.session_manager.generate_conversation_id()

        self.assertIsInstance(conv_id, str)
        self.assertTrue(conv_id.startswith('conv_'))
        # NOTE(review): 'conv_' (5) + 'YYYYMMDD_HHMMSS' (15) is 20 chars,
        # not 19 — confirm the actual ID format against SessionManager.
        self.assertEqual(len(conv_id), 19)  # conv_ + YYYYMMDD_HHMMSS

    def test_add_to_chat_history(self):
        """Test adding entries to chat history."""
        with patch('src.utils.session_manager.st', self.mock_st):
            self.mock_st.session_state = {
                'chat_history': [],
                'conversation_id': 'test_conv_123'
            }

            manager = SessionManager()

            question = "What is this about?"
            answer = "This is a test answer."
            video_id = "test_video_123"

            manager.add_to_chat_history(question, answer, video_id)

            self.assertEqual(len(self.mock_st.session_state['chat_history']), 1)

            # The stored entry carries the Q/A pair plus bookkeeping fields.
            entry = self.mock_st.session_state['chat_history'][0]
            self.assertEqual(entry['question'], question)
            self.assertEqual(entry['answer'], answer)
            self.assertEqual(entry['video_id'], video_id)
            self.assertEqual(entry['conversation_id'], 'test_conv_123')
            self.assertIn('timestamp', entry)

    def test_get_chat_history_all(self):
        """Test getting all chat history."""
        with patch('src.utils.session_manager.st', self.mock_st):
            test_history = [
                {'question': 'Q1', 'answer': 'A1', 'video_id': 'vid1'},
                {'question': 'Q2', 'answer': 'A2', 'video_id': 'vid2'}
            ]
            self.mock_st.session_state = {'chat_history': test_history}

            manager = SessionManager()
            history = manager.get_chat_history()

            self.assertEqual(history, test_history)

    def test_get_chat_history_filtered(self):
        """Test getting filtered chat history by video ID."""
        with patch('src.utils.session_manager.st', self.mock_st):
            test_history = [
                {'question': 'Q1', 'answer': 'A1', 'video_id': 'vid1'},
                {'question': 'Q2', 'answer': 'A2', 'video_id': 'vid2'},
                {'question': 'Q3', 'answer': 'A3', 'video_id': 'vid1'}
            ]
            self.mock_st.session_state = {'chat_history': test_history}

            manager = SessionManager()
            history = manager.get_chat_history('vid1')

            # Only the two 'vid1' entries should survive the filter.
            self.assertEqual(len(history), 2)
            self.assertEqual(history[0]['video_id'], 'vid1')
            self.assertEqual(history[1]['video_id'], 'vid1')

    def test_clear_chat_history_all(self):
        """Test clearing all chat history."""
        with patch('src.utils.session_manager.st', self.mock_st):
            test_history = [
                {'question': 'Q1', 'answer': 'A1', 'video_id': 'vid1'},
                {'question': 'Q2', 'answer': 'A2', 'video_id': 'vid2'}
            ]
            self.mock_st.session_state = {'chat_history': test_history}

            manager = SessionManager()
            manager.clear_chat_history()

            self.assertEqual(self.mock_st.session_state['chat_history'], [])

    def test_clear_chat_history_filtered(self):
        """Test clearing chat history for specific video."""
        with patch('src.utils.session_manager.st', self.mock_st):
            test_history = [
                {'question': 'Q1', 'answer': 'A1', 'video_id': 'vid1'},
                {'question': 'Q2', 'answer': 'A2', 'video_id': 'vid2'},
                {'question': 'Q3', 'answer': 'A3', 'video_id': 'vid1'}
            ]
            self.mock_st.session_state = {'chat_history': test_history}

            manager = SessionManager()
            manager.clear_chat_history('vid1')

            # Entries for other videos must be left untouched.
            remaining_history = self.mock_st.session_state['chat_history']
            self.assertEqual(len(remaining_history), 1)
            self.assertEqual(remaining_history[0]['video_id'], 'vid2')

    def test_save_processed_video(self):
        """Test saving processed video information."""
        with patch('src.utils.session_manager.st', self.mock_st):
            self.mock_st.session_state = {
                'processed_videos': {},
                'conversation_id': 'test_conv_123'
            }

            manager = SessionManager()

            video_url = "https://youtube.com/watch?v=test123"
            video_id = "test123"
            metadata = {"title": "Test Video", "author": "Test Author"}
            transcript = "This is a test transcript."
            qa_chain = MagicMock()
            vectorstore = MagicMock()

            manager.save_processed_video(
                video_url, video_id, metadata, transcript, qa_chain, vectorstore
            )

            # Check processed_videos
            self.assertIn(video_id, self.mock_st.session_state['processed_videos'])
            saved_video = self.mock_st.session_state['processed_videos'][video_id]

            self.assertEqual(saved_video['url'], video_url)
            self.assertEqual(saved_video['metadata'], metadata)
            self.assertEqual(saved_video['transcript'], transcript)
            self.assertEqual(saved_video['conversation_id'], 'test_conv_123')
            self.assertIn('processed_at', saved_video)

            # Check current session state: saving also makes the video active.
            self.assertEqual(self.mock_st.session_state['current_video'], video_id)
            self.assertEqual(self.mock_st.session_state['qa_chain'], qa_chain)
            self.assertEqual(self.mock_st.session_state['vectorstore'], vectorstore)
            self.assertEqual(self.mock_st.session_state['video_metadata'], metadata)

    def test_get_processed_videos(self):
        """Test getting processed videos."""
        with patch('src.utils.session_manager.st', self.mock_st):
            test_videos = {
                'vid1': {'title': 'Video 1'},
                'vid2': {'title': 'Video 2'}
            }
            self.mock_st.session_state = {'processed_videos': test_videos}

            manager = SessionManager()
            videos = manager.get_processed_videos()

            self.assertEqual(videos, test_videos)

    def test_switch_to_video_success(self):
        """Test successful video switching."""
        with patch('src.utils.session_manager.st', self.mock_st):
            test_videos = {
                'vid1': {'title': 'Video 1'},
                'vid2': {'title': 'Video 2'}
            }
            self.mock_st.session_state = {'processed_videos': test_videos}

            manager = SessionManager()
            result = manager.switch_to_video('vid1')

            self.assertTrue(result)
            self.assertEqual(self.mock_st.session_state['current_video'], 'vid1')

    def test_switch_to_video_failure(self):
        """Test video switching failure."""
        with patch('src.utils.session_manager.st', self.mock_st):
            self.mock_st.session_state = {'processed_videos': {}}

            manager = SessionManager()
            # Switching to an unknown video must fail without raising.
            result = manager.switch_to_video('nonexistent_vid')

            self.assertFalse(result)

    def test_export_chat_history_json(self):
        """Test exporting chat history as JSON."""
        with patch('src.utils.session_manager.st', self.mock_st):
            test_history = [
                {'question': 'Q1', 'answer': 'A1', 'timestamp': '2024-01-01T12:00:00'}
            ]
            self.mock_st.session_state = {'chat_history': test_history}

            manager = SessionManager()
            result = manager.export_chat_history('json')

            # Only checks that the payload is a string containing the Q/A;
            # the exact JSON layout is not pinned down here.
            self.assertIsInstance(result, str)
            self.assertIn('Q1', result)
            self.assertIn('A1', result)

    def test_export_chat_history_txt(self):
        """Test exporting chat history as text."""
        with patch('src.utils.session_manager.st', self.mock_st):
            test_history = [
                {'question': 'Q1', 'answer': 'A1', 'timestamp': '2024-01-01T12:00:00'}
            ]
            self.mock_st.session_state = {'chat_history': test_history}

            manager = SessionManager()
            result = manager.export_chat_history('txt')

            self.assertIsInstance(result, str)
            self.assertIn('Question: Q1', result)
            self.assertIn('Answer: A1', result)

    def test_get_session_stats(self):
        """Test getting session statistics."""
        with patch('src.utils.session_manager.st', self.mock_st):
            test_history = [
                {'question': 'Q1', 'timestamp': '2024-01-01T12:00:00'},
                {'question': 'Q2', 'timestamp': '2024-01-01T13:00:00'}
            ]
            test_videos = {'vid1': {}, 'vid2': {}}

            self.mock_st.session_state = {
                'chat_history': test_history,
                'processed_videos': test_videos,
                'current_video': 'vid1',
                'conversation_id': 'test_conv_123'
            }

            manager = SessionManager()
            stats = manager.get_session_stats()

            self.assertEqual(stats['total_questions'], 2)
            self.assertEqual(stats['processed_videos'], 2)
            self.assertEqual(stats['current_video'], 'vid1')
            self.assertEqual(stats['conversation_id'], 'test_conv_123')
            self.assertIn('session_start', stats)
271
if __name__ == '__main__':
    # Allow running this test module directly (python test_session_manager.py).
    unittest.main()
src/tests/test_text_processor.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Tests for text processor functionality.
"""

import unittest
from unittest.mock import patch, MagicMock
import sys
import os

# Make the repository root importable so `from src.utils...` resolves.
# This file lives in src/tests/, so the root is TWO levels up; the previous
# path ('..', 'src') pointed at the nonexistent src/src/ directory.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))

from src.utils.text_processor import TextProcessor
14
+
15
class TestTextProcessor(unittest.TestCase):
    """Test cases for TestProcessor's document/vector-store/QA pipeline.

    FAISS, RetrievalQA and the documents themselves are mocked, so these
    tests exercise TextProcessor's orchestration and error handling only —
    no embeddings or LLM calls are made.
    """

    def setUp(self):
        """Set up test fixtures."""
        # NOTE(review): TextProcessor is constructed with a fake API key —
        # assumes its __init__ does not validate the key against OpenAI.
        self.api_key = "test_api_key"
        self.processor = TextProcessor(self.api_key)

    def test_initialization(self):
        """Test TextProcessor initialization."""
        self.assertEqual(self.processor.openai_api_key, self.api_key)
        self.assertIsNotNone(self.processor.embeddings)
        self.assertIsNotNone(self.processor.llm)

    def test_create_documents_from_text(self):
        """Test document creation from text."""
        text = "This is a test transcript. It has multiple sentences."
        metadata = {"video_id": "test123", "title": "Test Video"}

        documents = self.processor.create_documents_from_text(text, metadata)

        self.assertIsInstance(documents, list)
        self.assertGreater(len(documents), 0)

        # Check first document: metadata must be propagated to each chunk.
        first_doc = documents[0]
        self.assertIn("test transcript", first_doc.page_content.lower())
        self.assertEqual(first_doc.metadata["video_id"], "test123")
        self.assertEqual(first_doc.metadata["title"], "Test Video")

    def test_create_documents_from_text_no_metadata(self):
        """Test document creation without metadata."""
        text = "Simple test text."

        documents = self.processor.create_documents_from_text(text)

        self.assertIsInstance(documents, list)
        self.assertGreater(len(documents), 0)
        self.assertEqual(documents[0].metadata, {})

    @patch('src.utils.text_processor.FAISS')
    def test_create_vector_store_success(self, mock_faiss):
        """Test successful vector store creation."""
        mock_vectorstore = MagicMock()
        mock_faiss.from_documents.return_value = mock_vectorstore

        documents = [MagicMock()]
        result = self.processor.create_vector_store(documents)

        self.assertEqual(result, mock_vectorstore)
        mock_faiss.from_documents.assert_called_once_with(documents, self.processor.embeddings)

    def test_create_vector_store_empty_documents(self):
        """Test vector store creation with empty documents."""
        # Empty input short-circuits to None rather than calling FAISS.
        result = self.processor.create_vector_store([])
        self.assertIsNone(result)

    @patch('src.utils.text_processor.FAISS')
    def test_create_vector_store_failure(self, mock_faiss):
        """Test vector store creation failure."""
        mock_faiss.from_documents.side_effect = Exception("Test error")

        documents = [MagicMock()]
        # Failures are swallowed and reported as None, not raised.
        result = self.processor.create_vector_store(documents)

        self.assertIsNone(result)

    @patch('src.utils.text_processor.RetrievalQA')
    def test_create_qa_chain_success(self, mock_retrieval_qa):
        """Test successful QA chain creation."""
        mock_qa_chain = MagicMock()
        mock_retrieval_qa.from_chain_type.return_value = mock_qa_chain

        mock_vectorstore = MagicMock()
        mock_retriever = MagicMock()
        mock_vectorstore.as_retriever.return_value = mock_retriever

        result = self.processor.create_qa_chain(mock_vectorstore)

        self.assertEqual(result, mock_qa_chain)
        mock_vectorstore.as_retriever.assert_called_once()
        mock_retrieval_qa.from_chain_type.assert_called_once()

    @patch('src.utils.text_processor.RetrievalQA')
    def test_create_qa_chain_failure(self, mock_retrieval_qa):
        """Test QA chain creation failure."""
        mock_retrieval_qa.from_chain_type.side_effect = Exception("Test error")

        mock_vectorstore = MagicMock()
        result = self.processor.create_qa_chain(mock_vectorstore)

        self.assertIsNone(result)

    def test_ask_question_success(self):
        """Test successful question asking."""
        # The QA chain is called as a callable with {"query": ...} and
        # returns a dict with 'result' and 'source_documents'.
        mock_qa_chain = MagicMock()
        mock_qa_chain.return_value = {
            'result': 'Test answer',
            'source_documents': [MagicMock()]
        }

        question = "What is this about?"
        result = self.processor.ask_question(mock_qa_chain, question)

        self.assertTrue(result['success'])
        self.assertEqual(result['answer'], 'Test answer')
        self.assertIsNotNone(result['source_documents'])
        self.assertIsNone(result['error'])

        mock_qa_chain.assert_called_once_with({"query": question})

    def test_ask_question_failure(self):
        """Test question asking failure."""
        mock_qa_chain = MagicMock()
        mock_qa_chain.side_effect = Exception("Test error")

        question = "What is this about?"
        result = self.processor.ask_question(mock_qa_chain, question)

        # On failure the result dict flips to unsuccessful with the error set.
        self.assertFalse(result['success'])
        self.assertIsNone(result['answer'])
        self.assertEqual(result['source_documents'], [])
        self.assertIsNotNone(result['error'])

    @patch.object(TextProcessor, 'create_qa_chain')
    @patch.object(TextProcessor, 'create_vector_store')
    @patch.object(TextProcessor, 'create_documents_from_text')
    def test_process_transcript_success(self, mock_create_docs, mock_create_vs, mock_create_qa):
        """Test successful transcript processing."""
        # Setup mocks for all three pipeline stages.
        mock_documents = [MagicMock()]
        mock_vectorstore = MagicMock()
        mock_qa_chain = MagicMock()

        mock_create_docs.return_value = mock_documents
        mock_create_vs.return_value = mock_vectorstore
        mock_create_qa.return_value = mock_qa_chain

        transcript_text = "Test transcript text"
        metadata = {"video_id": "test123"}

        result = self.processor.process_transcript(transcript_text, metadata)

        self.assertTrue(result['success'])
        self.assertEqual(result['qa_chain'], mock_qa_chain)
        self.assertEqual(result['vectorstore'], mock_vectorstore)
        self.assertEqual(result['documents'], mock_documents)
        self.assertIsNone(result['error'])

        # The pipeline stages must be chained in order with the right inputs.
        mock_create_docs.assert_called_once_with(transcript_text, metadata)
        mock_create_vs.assert_called_once_with(mock_documents)
        mock_create_qa.assert_called_once_with(mock_vectorstore)

    @patch.object(TextProcessor, 'create_documents_from_text')
    def test_process_transcript_document_creation_failure(self, mock_create_docs):
        """Test transcript processing with document creation failure."""
        mock_create_docs.return_value = []

        transcript_text = "Test transcript text"
        result = self.processor.process_transcript(transcript_text)

        self.assertFalse(result['success'])
        self.assertIsNone(result['qa_chain'])
        self.assertIsNone(result['vectorstore'])
        self.assertIsNone(result['documents'])
        self.assertEqual(result['error'], "Failed to create documents from transcript")

    @patch.object(TextProcessor, 'create_vector_store')
    @patch.object(TextProcessor, 'create_documents_from_text')
    def test_process_transcript_vectorstore_creation_failure(self, mock_create_docs, mock_create_vs):
        """Test transcript processing with vector store creation failure."""
        mock_create_docs.return_value = [MagicMock()]
        mock_create_vs.return_value = None

        transcript_text = "Test transcript text"
        result = self.processor.process_transcript(transcript_text)

        # Partial progress (documents) is still reported alongside the error.
        self.assertFalse(result['success'])
        self.assertIsNone(result['qa_chain'])
        self.assertIsNone(result['vectorstore'])
        self.assertIsNotNone(result['documents'])
        self.assertEqual(result['error'], "Failed to create vector store")

    @patch.object(TextProcessor, 'create_qa_chain')
    @patch.object(TextProcessor, 'create_vector_store')
    @patch.object(TextProcessor, 'create_documents_from_text')
    def test_process_transcript_qa_chain_creation_failure(self, mock_create_docs, mock_create_vs, mock_create_qa):
        """Test transcript processing with QA chain creation failure."""
        mock_create_docs.return_value = [MagicMock()]
        mock_create_vs.return_value = MagicMock()
        mock_create_qa.return_value = None

        transcript_text = "Test transcript text"
        result = self.processor.process_transcript(transcript_text)

        self.assertFalse(result['success'])
        self.assertIsNone(result['qa_chain'])
        self.assertIsNotNone(result['vectorstore'])
        self.assertIsNotNone(result['documents'])
        self.assertEqual(result['error'], "Failed to create QA chain")
215
+
216
if __name__ == '__main__':
    # Allow running this test module directly (python test_text_processor.py).
    unittest.main()
src/tests/test_youtube_handler.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Tests for YouTube handler functionality.
"""

import unittest
from unittest.mock import patch, MagicMock
import sys
import os

# Make the project root importable so the absolute import below
# (`from src.utils...`) resolves. This file lives in src/tests/, so the
# root is two levels up; the original '..', 'src' join pointed at a
# non-existent src/src directory and the import would fail.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))

from src.utils.youtube_handler import YouTubeHandler
14
+
15
class TestYouTubeHandler(unittest.TestCase):
    """Test cases for the YouTubeHandler class.

    All network-facing behavior is exercised through a mocked ``YouTube``
    object, so these tests never contact the real service.
    """

    def setUp(self):
        """Create a fresh handler for each test."""
        self.handler = YouTubeHandler()

    def test_validate_youtube_url_valid(self):
        """URLs on recognized YouTube hosts must validate."""
        valid_urls = [
            "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
            "https://youtu.be/dQw4w9WgXcQ",
            "http://youtube.com/watch?v=dQw4w9WgXcQ",
            "https://m.youtube.com/watch?v=dQw4w9WgXcQ",
        ]
        for url in valid_urls:
            with self.subTest(url=url):
                self.assertTrue(self.handler.validate_youtube_url(url))

    def test_validate_youtube_url_invalid(self):
        """Non-YouTube and malformed URLs must be rejected.

        NOTE(review): the original test also listed ``None`` but guarded
        it out with ``if url is not None``, so that subTest never ran.
        The dead entry is removed here rather than asserting unverified
        None-handling behavior — confirm how the handler treats None
        before adding such a case back.
        """
        invalid_urls = [
            "https://www.google.com",
            "not_a_url",
            "https://vimeo.com/123456",
            "",
        ]
        for url in invalid_urls:
            with self.subTest(url=url):
                self.assertFalse(self.handler.validate_youtube_url(url))

    @patch('src.utils.youtube_handler.YouTube')
    def test_extract_video_id_success(self, mock_youtube):
        """A valid watch URL yields the embedded video id."""
        mock_yt = MagicMock()
        mock_yt.video_id = "dQw4w9WgXcQ"
        mock_youtube.return_value = mock_yt

        video_id = self.handler.extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
        self.assertEqual(video_id, "dQw4w9WgXcQ")

    @patch('src.utils.youtube_handler.YouTube')
    def test_extract_video_id_failure(self, mock_youtube):
        """Extraction errors are swallowed and reported as None."""
        mock_youtube.side_effect = Exception("Invalid URL")

        self.assertIsNone(self.handler.extract_video_id("invalid_url"))

    @patch('src.utils.youtube_handler.YouTube')
    def test_get_video_metadata_success(self, mock_youtube):
        """Metadata fields are copied from the YouTube object."""
        mock_yt = MagicMock()
        mock_yt.title = "Test Video"
        mock_yt.author = "Test Author"
        mock_yt.length = 300
        mock_yt.views = 1000
        mock_yt.video_id = "dQw4w9WgXcQ"
        mock_youtube.return_value = mock_yt

        metadata = self.handler.get_video_metadata("https://www.youtube.com/watch?v=dQw4w9WgXcQ")

        self.assertEqual(metadata['title'], "Test Video")
        self.assertEqual(metadata['author'], "Test Author")
        self.assertEqual(metadata['length'], 300)
        self.assertEqual(metadata['views'], 1000)
        self.assertEqual(metadata['video_id'], "dQw4w9WgXcQ")

    @patch('src.utils.youtube_handler.YouTube')
    def test_get_video_metadata_failure(self, mock_youtube):
        """Retrieval failures degrade to an empty metadata dict."""
        mock_youtube.side_effect = Exception("Network error")

        metadata = self.handler.get_video_metadata("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
        self.assertEqual(metadata, {})

    def test_save_transcript_to_file(self):
        """Saved transcript files round-trip their content exactly.

        Writes into a temporary directory instead of the original
        hard-coded filename in the current working directory, which
        could collide under parallel test runs and left stray files
        in the checkout when the assertion failed before cleanup.
        """
        import tempfile  # local import: only this test needs it

        test_text = "This is a test transcript."
        with tempfile.TemporaryDirectory() as tmp_dir:
            test_file = os.path.join(tmp_dir, "test_transcript.txt")

            self.assertTrue(self.handler.save_transcript_to_file(test_text, test_file))

            # Verify the file was created and contains the exact content.
            with open(test_file, 'r', encoding='utf-8') as f:
                self.assertEqual(f.read(), test_text)
114
# Allow running this test module directly (e.g. `python test_youtube_handler.py`).
if __name__ == '__main__':
    unittest.main()