Kackle committed on
Commit
d8e49a7
·
verified ·
1 Parent(s): 1d4ab9d

mistake copy again

Browse files
Files changed (1) hide show
  1. video_parser.py +78 -178
video_parser.py CHANGED
@@ -1,191 +1,91 @@
 
 
 
1
  import os
2
- import boto3
3
- import json
4
- from dotenv import load_dotenv
5
- from video_parser import VideoParser
6
- from excel_parser import ExcelParser
7
- import re
8
 
9
- load_dotenv()
10
-
11
- class NovaProAgent:
12
  def __init__(self):
13
- print("NovaProAgent initialized.")
14
-
15
- # Get AWS credentials from environment variables
16
- aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
17
- aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
18
-
19
- # Initialize the AWS client
20
- boto3.client(
21
- 's3',
22
- aws_access_key_id=aws_access_key_id,
23
- aws_secret_access_key=aws_secret_access_key
24
- )
25
- session = boto3.session.Session()
26
-
27
- self.bedrock_client = boto3.client(
28
- service_name='bedrock-runtime',
29
- region_name='us-east-1'
30
- )
31
-
32
- self.model_id = "amazon.nova-pro-v1:0"
33
- self.content_type = "application/json"
34
- self.accept = "application/json"
35
-
36
- # Initialize parsers
37
- self.video_parser = VideoParser()
38
- self.excel_parser = ExcelParser()
39
-
40
- async def __call__(self, question: str) -> str:
41
- print(f"NovaProAgent received question (first 50 chars): {question}...")
42
-
43
- try:
44
- # Check if question involves video analysis
45
- if 'youtube.com' in question or 'video' in question.lower():
46
- return await self._handle_video_question(question)
47
-
48
- # Check if question involves Excel files
49
- if '.xlsx' in question or '.xls' in question or 'excel' in question.lower():
50
- return await self._handle_excel_question(question)
51
-
52
- # Regular text-based question
53
- return await self._handle_text_question(question)
54
-
55
- except Exception as e:
56
- print(f"Error processing question: {e}")
57
- return "Unable to process request."
58
 
59
- async def _handle_video_question(self, question: str) -> str:
60
- """Handle questions that require video analysis"""
61
- # Extract YouTube URL
62
- youtube_url = re.search(r'https://www\.youtube\.com/watch\?v=[\w-]+', question)
63
- if not youtube_url:
64
- return "No valid YouTube URL found in question."
65
-
66
- url = youtube_url.group()
67
-
68
- # Extract video ID for reference
69
- video_id = re.search(r'v=([\w-]+)', url).group(1)
70
-
71
- # Use Nova Pro to provide intelligent response about video analysis
72
- video_prompt = f"""User is asking about a YouTube video: {url}
73
- Video ID: {video_id}
74
- User question: {question}
75
-
76
- Provide a helpful response about video analysis limitations and suggest alternatives."""
77
-
78
- payload = {
79
- "messages": [{
80
- "role": "user",
81
- "content": [{"text": video_prompt}]
82
- }],
83
- "inferenceConfig": {
84
- "max_new_tokens": 150,
85
- "temperature": 0.0
86
- }
87
  }
88
 
89
- try:
90
- response = self.bedrock_client.invoke_model(
91
- modelId=self.model_id,
92
- contentType=self.content_type,
93
- accept=self.accept,
94
- body=json.dumps(payload)
95
- )
96
-
97
- response_body = json.loads(response['body'].read())
98
- return response_body['output']['message']['content'][0]['text'].strip()
99
-
100
- except Exception as e:
101
- return f"Video ID: {video_id}. Direct video analysis unavailable due to access restrictions."
102
 
103
- async def _handle_excel_question(self, question: str) -> str:
104
- """Handle questions that require Excel file analysis"""
105
- # Extract file path from question if present
106
- file_patterns = [r'([A-Za-z]:\\[^\s]+\.xlsx?)', r'([^\s]+\.xlsx?)']
107
- file_path = None
108
-
109
- for pattern in file_patterns:
110
- match = re.search(pattern, question)
111
- if match:
112
- file_path = match.group(1)
113
  break
 
 
 
 
 
 
 
 
 
 
114
 
115
- if not file_path:
116
- return "Please provide Excel file path in your question."
117
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  try:
119
- if 'sales' in question.lower() and 'food' in question.lower():
120
- results = self.excel_parser.analyze_sales_data(file_path)
121
- return results.get('total_food_sales', 'No sales data found')
122
- else:
123
- df = self.excel_parser.read_excel_file(file_path)
124
- return f"Excel file loaded with {len(df)} rows and {len(df.columns)} columns."
 
 
 
 
 
 
 
 
 
 
 
125
 
126
  except Exception as e:
127
- return f"Excel analysis failed: {str(e)}"
128
 
129
- async def _handle_text_question(self, question: str) -> str:
130
- """Handle regular text-based questions"""
131
- # Create a more focused prompt for concise answers
132
- prompt = f"""Answer this question directly and concisely. Provide only the essential information requested, not explanations or step-by-step reasoning unless specifically asked.
133
-
134
- Question: {question}
135
-
136
- Answer:"""
137
-
138
- # Prepare the request payload for Nova Pro
139
- payload = {
140
- "messages": [
141
- {
142
- "role": "user",
143
- "content": [{
144
- "text": prompt
145
- }]
146
- }
147
- ],
148
- "inferenceConfig": {
149
- "max_new_tokens": 250,
150
- "temperature": 0.0
151
- }
152
- }
153
-
154
- # Call Nova Pro model
155
- response = self.bedrock_client.invoke_model(
156
- modelId=self.model_id,
157
- contentType=self.content_type,
158
- accept=self.accept,
159
- body=json.dumps(payload)
160
- )
161
-
162
- # Parse response
163
- response_body = json.loads(response['body'].read())
164
- answer = response_body['output']['message']['content'][0]['text']
165
-
166
- # Clean up the answer
167
- answer = answer.strip()
168
-
169
- # Remove verbose beginnings
170
- verbose_starts = [
171
- "To answer this question",
172
- "Based on the information",
173
- "According to",
174
- "The answer is",
175
- "Looking at"
176
- ]
177
-
178
- for start in verbose_starts:
179
- if answer.lower().startswith(start.lower()):
180
- sentences = answer.split('. ')
181
- for sentence in sentences[1:]:
182
- if len(sentence.strip()) > 10:
183
- answer = sentence.strip()
184
- break
185
-
186
- # Limit length
187
- if len(answer) > 200:
188
- sentences = answer.split('. ')
189
- answer = sentences[0] + '.'
190
-
191
- return answer
 
1
+ import cv2
2
+ import requests
3
+ import tempfile
4
  import os
5
+ from urllib.parse import urlparse, parse_qs
6
+ import yt_dlp
 
 
 
 
7
 
8
class VideoParser:
    """Download YouTube videos and pull frames, audio and metadata from them.

    Each instance owns a scratch directory created with ``tempfile.mkdtemp``.
    Call :meth:`cleanup` when done, or use the instance as a context manager
    so the directory is removed automatically.
    """

    def __init__(self):
        # Downloaded videos are written here; cleanup() removes the directory.
        self.temp_dir = tempfile.mkdtemp()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.cleanup()

    def download_youtube_video(self, url: str) -> str:
        """Download the video at *url* into the scratch directory.

        Args:
            url: Address of the video to fetch.

        Returns:
            The local file path yt-dlp computed for the download.
        """
        ydl_opts = {
            # Smallest stream at or below 480p, else the smallest available.
            'format': 'worst[height<=480]/worst',
            'outtmpl': os.path.join(self.temp_dir, '%(title)s.%(ext)s'),
            'quiet': True,
            'no_warnings': True,
            'extract_flat': False,
            'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=True)
            return ydl.prepare_filename(info)

    @staticmethod
    def _frame_timestamp(frame_index: int, fps: float):
        """Return the offset in seconds of *frame_index*, or None if *fps* is unusable."""
        # `not fps > 0` also rejects NaN, which `fps <= 0` would let through.
        if not fps > 0:
            return None
        return frame_index / fps

    def analyze_video_frames(self, video_path: str, sample_rate: int = 30):
        """Collect every ``sample_rate``-th frame of a video.

        Args:
            video_path: Path of the video file to open.
            sample_rate: Keep one frame out of this many (frame 0 is kept).

        Returns:
            A list of dicts with keys ``'frame'`` (frame index),
            ``'timestamp'`` (seconds, or ``None`` when the capture reports
            no usable frame rate) and ``'frame_data'`` (the decoded frame).
            All sampled frames are held in memory at once.
        """
        cap = cv2.VideoCapture(video_path)
        results = []
        try:
            # Read once: the value is loop-invariant, and a reported rate of
            # 0 would otherwise raise ZeroDivisionError on the first sample.
            fps = cap.get(cv2.CAP_PROP_FPS)
            frame_count = 0

            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                if frame_count % sample_rate == 0:
                    # Basic frame analysis - object detection would plug in here.
                    results.append({
                        'frame': frame_count,
                        'timestamp': self._frame_timestamp(frame_count, fps),
                        'frame_data': frame,
                    })

                frame_count += 1
        finally:
            # Release the capture even if reading or sampling raised.
            cap.release()

        return results

    def extract_audio(self, video_path: str) -> str:
        """Extract the audio track of a video to a WAV file next to it.

        The output is 16 kHz, mono, 16-bit PCM and is written to the video's
        path with its extension replaced by ``.wav``.

        Args:
            video_path: Path of the source video.

        Returns:
            Path of the WAV file that was written.

        Raises:
            FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
            subprocess.CalledProcessError: If ffmpeg exits with a non-zero
                status; its captured output is available on the exception.
        """
        import subprocess

        audio_path = video_path.rsplit('.', 1)[0] + '.wav'

        # check=True: without it a failed conversion went unnoticed and the
        # caller received a path to a file that was never created.
        subprocess.run([
            'ffmpeg', '-i', video_path, '-vn', '-acodec', 'pcm_s16le',
            '-ar', '16000', '-ac', '1', audio_path, '-y'
        ], capture_output=True, check=True)

        return audio_path

    @staticmethod
    def _summarize_info(info: dict) -> dict:
        """Reduce a yt-dlp info dict to the handful of fields callers use."""
        # `or ''` covers a key that is present but None, which the
        # `.get(key, '')` default alone does not.
        description = info.get('description') or ''
        return {
            'title': info.get('title', 'Unknown'),
            'description': description[:500],
            'duration': info.get('duration', 0),
            'view_count': info.get('view_count', 0),
            'upload_date': info.get('upload_date', 'Unknown'),
            'uploader': info.get('uploader', 'Unknown'),
        }

    def get_youtube_metadata(self, url: str) -> dict:
        """Fetch metadata for the video at *url* without downloading it.

        Args:
            url: Address of the video to inspect.

        Returns:
            A dict with ``title``, ``description`` (first 500 characters),
            ``duration``, ``view_count``, ``upload_date`` and ``uploader``.
            On any failure, ``{'error': <message>}`` is returned instead of
            raising.
        """
        try:
            ydl_opts = {
                'quiet': True,
                'skip_download': True,
                'extract_flat': False,
            }

            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(url, download=False)

            return self._summarize_info(info)

        except Exception as e:
            # Deliberate boundary: callers get an error dict, not an exception.
            return {'error': str(e)}

    def cleanup(self):
        """Remove the scratch directory and everything in it; errors are ignored."""
        import shutil

        shutil.rmtree(self.temp_dir, ignore_errors=True)