Kackle committed on
Commit
1d4ab9d
·
verified ·
1 Parent(s): 4e65abc

remove youtube because of bot issues

Browse files
Files changed (1) hide show
  1. video_parser.py +178 -78
video_parser.py CHANGED
@@ -1,91 +1,191 @@
1
- import cv2
2
- import requests
3
- import tempfile
4
  import os
5
- from urllib.parse import urlparse, parse_qs
6
- import yt_dlp
 
 
 
 
7
 
8
class VideoParser:
    """Download YouTube videos and extract frames, audio and metadata.

    Each instance owns a private temporary directory (``temp_dir``) that
    downloads are written into. Call :meth:`cleanup` when finished to
    delete that directory and everything in it.
    """

    def __init__(self):
        # Scratch directory for downloaded media; removed by cleanup().
        self.temp_dir = tempfile.mkdtemp()

    def download_youtube_video(self, url: str) -> str:
        """Download the video at *url* into ``temp_dir`` and return its path.

        The format selector asks yt-dlp for the lowest-quality stream that is
        at most 480 pixels high, falling back to the lowest-quality stream
        overall, to keep downloads small.
        """
        ydl_opts = {
            'format': 'worst[height<=480]/worst',
            'outtmpl': os.path.join(self.temp_dir, '%(title)s.%(ext)s'),
            'quiet': True,
            'no_warnings': True,
            'extract_flat': False,
            'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=True)
            return ydl.prepare_filename(info)

    def analyze_video_frames(self, video_path: str, sample_rate: int = 30):
        """Sample every *sample_rate*-th frame of the video at *video_path*.

        Args:
            video_path: Path of the video file to open with OpenCV.
            sample_rate: Keep one frame out of this many; must be non-zero.

        Returns:
            A list of dicts with keys ``'frame'`` (frame index),
            ``'timestamp'`` (seconds from the start, or ``None`` when the
            container does not report a usable frame rate) and
            ``'frame_data'`` (the frame returned by ``cap.read()``).

        Raises:
            ValueError: If *sample_rate* is zero.
        """
        # Fail fast instead of hitting a modulo-by-zero on the first frame.
        if sample_rate == 0:
            raise ValueError("sample_rate must be non-zero")

        cap = cv2.VideoCapture(video_path)
        results = []
        try:
            # The frame rate is constant for the capture, so read it once.
            # OpenCV reports 0 when it is unknown; guard the division below.
            fps = cap.get(cv2.CAP_PROP_FPS)
            frame_count = 0

            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                if frame_count % sample_rate == 0:
                    # Basic frame analysis - object detection would plug in here
                    results.append({
                        'frame': frame_count,
                        'timestamp': frame_count / fps if fps and fps > 0 else None,
                        'frame_data': frame
                    })

                frame_count += 1
        finally:
            # Release the capture even if reading a frame raised.
            cap.release()

        return results

    def extract_audio(self, video_path: str) -> str:
        """Extract the audio track of *video_path* to a WAV file next to it.

        ffmpeg is asked for 16 kHz, mono, 16-bit PCM audio with the video
        stream dropped. The output overwrites any existing file of that name.

        Returns:
            The path of the written ``.wav`` file.

        Raises:
            RuntimeError: If ffmpeg exits with a non-zero status.
        """
        # splitext only looks at the final path component, so a dot in a
        # directory name cannot truncate the path.
        audio_path = os.path.splitext(video_path)[0] + '.wav'

        # Use ffmpeg to extract audio
        import subprocess
        completed = subprocess.run([
            'ffmpeg', '-i', video_path, '-vn', '-acodec', 'pcm_s16le',
            '-ar', '16000', '-ac', '1', audio_path, '-y'
        ], capture_output=True)

        if completed.returncode != 0:
            # Surface the failure rather than returning a path that may not exist.
            stderr_tail = completed.stderr.decode('utf-8', errors='replace')[-500:]
            raise RuntimeError(
                f"ffmpeg failed with exit code {completed.returncode}: {stderr_tail}"
            )

        return audio_path

    def get_youtube_metadata(self, url: str) -> dict:
        """Fetch metadata for the YouTube video at *url* without downloading it.

        Returns:
            A dict with ``title``, ``description`` (first 500 characters),
            ``duration``, ``view_count``, ``upload_date`` and ``uploader``;
            or ``{'error': <message>}`` if anything goes wrong.
        """
        try:
            ydl_opts = {
                'quiet': True,
                'no_download': True,
                'extract_flat': False
            }

            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(url, download=False)

            return {
                'title': info.get('title', 'Unknown'),
                # The key can be present with a None value, so coalesce before slicing.
                'description': (info.get('description') or '')[:500],
                'duration': info.get('duration', 0),
                'view_count': info.get('view_count', 0),
                'upload_date': info.get('upload_date', 'Unknown'),
                'uploader': info.get('uploader', 'Unknown')
            }

        except Exception as e:
            # Best-effort lookup: report the failure to the caller as data.
            return {'error': str(e)}

    def cleanup(self):
        """Delete ``temp_dir`` and its contents; safe to call more than once."""
        import shutil
        shutil.rmtree(self.temp_dir, ignore_errors=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import boto3
3
+ import json
4
+ from dotenv import load_dotenv
5
+ from video_parser import VideoParser
6
+ from excel_parser import ExcelParser
7
+ import re
8
 
9
+ load_dotenv()
10
+
11
class NovaProAgent:
    """Question-answering agent backed by Amazon Nova Pro on AWS Bedrock.

    Awaiting an instance call with a question routes it to one of three
    handlers: video questions, Excel questions, or plain text questions.
    Every handler returns a string; unexpected errors are logged and turned
    into a generic failure message by ``__call__``.
    """

    # Only full desktop watch URLs are recognised; group 1 is the video ID.
    _YOUTUBE_URL_RE = re.compile(r'https://www\.youtube\.com/watch\?v=([\w-]+)')

    # Tried in order: an absolute Windows path first, then any whitespace-free token.
    _EXCEL_PATH_RES = (
        re.compile(r'([A-Za-z]:\\[^\s]+\.xlsx?)'),
        re.compile(r'([^\s]+\.xlsx?)'),
    )

    # "The answer is <value>" lead-in whose remainder IS the answer.
    _DIRECT_ANSWER_RE = re.compile(r'^the answer is[\s:]+', re.IGNORECASE)

    # Lead-ins that mark a preamble sentence which can be dropped.
    _VERBOSE_STARTS = (
        "To answer this question",
        "Based on the information",
        "According to",
        "The answer is",
        "Looking at",
    )

    def __init__(self):
        print("NovaProAgent initialized.")

        # boto3 resolves credentials through its default chain, which includes
        # the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables,
        # so no explicit credential plumbing is needed here.
        self.bedrock_client = boto3.client(
            service_name='bedrock-runtime',
            region_name='us-east-1'
        )

        self.model_id = "amazon.nova-pro-v1:0"
        self.content_type = "application/json"
        self.accept = "application/json"

        # Initialize parsers
        self.video_parser = VideoParser()
        self.excel_parser = ExcelParser()

    async def __call__(self, question: str) -> str:
        """Answer *question*, dispatching on keywords found in its text.

        Returns the handler's answer, or ``"Unable to process request."`` if
        the handler raised.
        """
        print(f"NovaProAgent received question (first 50 chars): {question[:50]}...")

        try:
            lowered = question.lower()

            # Check if question involves video analysis
            if 'youtube.com' in question or 'video' in lowered:
                return await self._handle_video_question(question)

            # Check if question involves Excel files ('.xls' also covers '.xlsx')
            if '.xls' in question or 'excel' in lowered:
                return await self._handle_excel_question(question)

            # Regular text-based question
            return await self._handle_text_question(question)

        except Exception as e:
            # Top-level boundary: log and degrade to a fixed message.
            print(f"Error processing question: {e}")
            return "Unable to process request."

    def _invoke_model(self, prompt: str, max_new_tokens: int) -> str:
        """Send *prompt* to the model as one user message; return the stripped reply text.

        NOTE: ``invoke_model`` is a synchronous boto3 call, so the awaiting
        coroutine blocks the event loop for the duration of the request.
        """
        payload = {
            "messages": [{
                "role": "user",
                "content": [{"text": prompt}]
            }],
            "inferenceConfig": {
                "max_new_tokens": max_new_tokens,
                "temperature": 0.0
            }
        }

        response = self.bedrock_client.invoke_model(
            modelId=self.model_id,
            contentType=self.content_type,
            accept=self.accept,
            body=json.dumps(payload)
        )

        response_body = json.loads(response['body'].read())
        return response_body['output']['message']['content'][0]['text'].strip()

    async def _handle_video_question(self, question: str) -> str:
        """Handle questions that mention a video.

        The video itself is never fetched: the model is asked to explain the
        limitation and suggest alternatives. If the model call fails, a fixed
        message containing the video ID is returned instead.
        """
        match = self._YOUTUBE_URL_RE.search(question)
        if not match:
            return "No valid YouTube URL found in question."

        url = match.group()
        video_id = match.group(1)

        video_prompt = (
            f"User is asking about a YouTube video: {url}\n"
            f"Video ID: {video_id}\n"
            f"User question: {question}\n"
            "\n"
            "Provide a helpful response about video analysis limitations and suggest alternatives."
        )

        try:
            return self._invoke_model(video_prompt, max_new_tokens=150)
        except Exception as e:
            # Keep the fallback answer, but leave a trace of why it was used.
            print(f"Video question model call failed: {e}")
            return f"Video ID: {video_id}. Direct video analysis unavailable due to access restrictions."

    async def _handle_excel_question(self, question: str) -> str:
        """Handle questions that reference an Excel file.

        The first ``.xls``/``.xlsx`` path found in *question* is used.
        Questions mentioning both "sales" and "food" return the parser's
        ``total_food_sales`` figure; anything else returns a row/column
        summary of the sheet.
        """
        file_path = None
        for pattern in self._EXCEL_PATH_RES:
            match = pattern.search(question)
            if match:
                # Drop an opening quote/bracket that was glued to the path.
                file_path = match.group(1).lstrip('\'"`([<{')
                break

        if not file_path:
            return "Please provide Excel file path in your question."

        try:
            lowered = question.lower()
            if 'sales' in lowered and 'food' in lowered:
                results = self.excel_parser.analyze_sales_data(file_path)
                # The figure may be numeric; this method promises a string.
                return str(results.get('total_food_sales', 'No sales data found'))

            df = self.excel_parser.read_excel_file(file_path)
            return f"Excel file loaded with {len(df)} rows and {len(df.columns)} columns."

        except Exception as e:
            return f"Excel analysis failed: {str(e)}"

    async def _handle_text_question(self, question: str) -> str:
        """Handle regular text questions with a prompt that asks for a terse answer.

        The model reply is post-processed: verbose lead-ins are removed and
        replies longer than 200 characters are cut to their first sentence.
        """
        prompt = (
            "Answer this question directly and concisely. Provide only the essential information requested, not explanations or step-by-step reasoning unless specifically asked.\n"
            "\n"
            f"Question: {question}\n"
            "\n"
            "Answer:"
        )

        answer = self._invoke_model(prompt, max_new_tokens=250)
        answer = self._strip_verbose_lead_in(answer)
        return self._limit_length(answer)

    def _strip_verbose_lead_in(self, answer: str) -> str:
        """Remove a wordy opening from *answer* without losing the answer itself."""
        # "The answer is X ..." carries the answer in the same sentence, so
        # strip only the lead-in words rather than the whole sentence.
        direct = self._DIRECT_ANSWER_RE.match(answer)
        if direct:
            remainder = answer[direct.end():].strip()
            if remainder:
                return remainder

        # For other lead-ins, skip the preamble sentence when a later sentence
        # of meaningful length (more than 10 characters) exists.
        for start in self._VERBOSE_STARTS:
            if answer.lower().startswith(start.lower()):
                for sentence in answer.split('. ')[1:]:
                    candidate = sentence.strip()
                    if len(candidate) > 10:
                        answer = candidate
                        break

        return answer

    @staticmethod
    def _limit_length(answer: str, max_chars: int = 200) -> str:
        """Cut *answer* to its first sentence when it exceeds *max_chars*."""
        if len(answer) <= max_chars:
            return answer

        first_sentence = answer.split('. ')[0]
        # Avoid doubling the terminator when the sentence already has one.
        if first_sentence.endswith(('.', '!', '?')):
            return first_sentence
        return first_sentence + '.'