Abhinav-hf committed on
Commit
209ff06
·
verified ·
1 Parent(s): 112e3ad

Update Backend/OCR/Dynamic/VideoOCR.py

Browse files
Files changed (1) hide show
  1. Backend/OCR/Dynamic/VideoOCR.py +212 -73
Backend/OCR/Dynamic/VideoOCR.py CHANGED
@@ -1,33 +1,53 @@
1
  import cv2
2
  import numpy as np
3
 
4
- def frame_similarity_detection(video_path, threshold=350*1E4, scale_factor=0.45, output_video_path="non_similar_frames_output.mp4"):
5
  # Open the video file
6
  cap = cv2.VideoCapture(video_path)
7
 
8
  # Get the total number of frames in the video
9
  total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
 
 
10
  print(f"Total number of frames in the video: {total_frames}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  # Initialize variables
13
  prev_frame = None
14
  frame_count = 0
15
  non_similar_frames = []
16
  frame_list = [] # List to store frames that are non-similar
 
17
 
18
- # Open the output video file
19
- fourcc = cv2.VideoWriter_fourcc(*'mp4v') # Codec for mp4 format
20
- frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
21
- frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
22
-
23
- # Resize the frame dimensions
24
- resized_width = 640
25
- resized_height = 480
26
-
27
- out = cv2.VideoWriter(output_video_path, fourcc, cap.get(cv2.CAP_PROP_FPS), (resized_width, resized_height))
28
-
29
- # To store the first frame in case all frames are similar
30
- first_frame = None
31
 
32
  while True:
33
  ret, frame = cap.read()
@@ -36,16 +56,52 @@ def frame_similarity_detection(video_path, threshold=350*1E4, scale_factor=0.45,
36
 
37
  frame_count += 1
38
 
39
- # Resize the frame to reduce resolution
40
  resized_frame = cv2.resize(frame, (resized_width, resized_height))
41
 
42
- # Save the first frame to be used later if needed
43
- if frame_count == 1:
44
- first_frame = resized_frame
45
-
46
  # Convert frame to grayscale (for faster processing)
47
  gray_frame = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2GRAY)
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  if prev_frame is not None:
50
  # Compute absolute difference between current and previous frame
51
  frame_diff = cv2.absdiff(prev_frame, gray_frame)
@@ -56,30 +112,33 @@ def frame_similarity_detection(video_path, threshold=350*1E4, scale_factor=0.45,
56
  # If the difference is above the threshold, consider frames as non-similar
57
  if diff_sum > threshold:
58
  non_similar_frames.append(frame_count) # Save the frame number
59
- frame_list.append(resized_frame) # Store the non-similar frame in the list
60
 
61
  # Set the current frame as the previous frame for the next iteration
62
  prev_frame = gray_frame
63
 
64
  # If no non-similar frames were detected, add the first frame
65
- if not non_similar_frames and first_frame is not None:
66
- frame_list.append(first_frame) # Add the first frame to the list
 
 
 
67
 
68
- # Write the non-similar frames (or the single frame if no non-similar frames) to the output video file
69
  for frame in frame_list:
70
  out.write(frame)
71
 
72
- # Release the video capture and writer objects
73
  cap.release()
74
  out.release()
75
- cv2.destroyAllWindows() # Close all OpenCV windows
76
 
77
  # Print the list of frames that are not similar
78
  if non_similar_frames:
79
- print(f"Frames not similar (above difference threshold of {threshold}): {non_similar_frames}")
80
  print(f"Output video saved as: {output_video_path}")
81
  else:
82
- print(f"All frames are similar (below difference threshold of {threshold}). One frame has been included.")
83
 
84
  print(f"Total non-similar frames: {len(non_similar_frames)}")
85
 
@@ -95,8 +154,9 @@ import gradio as gr
95
  import google.generativeai as genai
96
  import pandas as pd
97
  # from google.colab import userdata
 
 
98
 
99
- # use_gpu = paddle.device.is_compiled_with_cuda() and paddle.device.get_device().startswith("gpu")
100
  # Define paths
101
  ocr = PaddleOCR(use_angle_cls=True, lang='en')
102
 
@@ -105,65 +165,132 @@ ocr = PaddleOCR(use_angle_cls=True, lang='en')
105
  GOOGLE_API_KEY = os.getenv("GEMINI_API")
106
  genai.configure(api_key=GOOGLE_API_KEY)
107
 
108
- # Function to add branding to a frame
109
- def add_branding(frame, text="Abhinav Video OCR", position=(50, 50), font_scale=2, font_thickness=3,
110
- text_color=(255, 255, 255), bg_color=(0, 0, 0)):
 
 
 
 
 
 
 
 
 
111
  overlay = frame.copy()
112
  alpha = 0.6 # Transparency factor
113
 
114
  # Get the width and height of the text box
115
  (text_width, text_height), _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, font_thickness)
116
- x, y = position
117
-
118
- # Draw a rectangle and put the text on it
119
- cv2.rectangle(overlay, (x, y + 10), (x + text_width, y - text_height - 10), bg_color, -1)
 
 
 
 
 
 
 
 
 
 
 
120
  cv2.addWeighted(overlay, alpha, frame, 1 - alpha, 0, frame)
121
- cv2.putText(frame, text, position, cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color, font_thickness)
 
 
122
 
123
  return frame
124
 
 
125
  # Function to preprocess the frame for OCR
126
- def preprocess_frame(frame, resize_width=600):
127
- resized = cv2.resize(frame, (resize_width, int(frame.shape[0] * (resize_width / frame.shape[1]))))
128
- gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
129
- return gray, resized
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  def parse_gemini_response(response_text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  parsed_data = {
132
  "Manufacturing Date": "",
133
  "Expiry Date": "",
134
- "MRP Details": ""
135
  }
 
136
  for line in response_text.split("\n"):
137
  if line.startswith("Manufacturing Date:"):
138
- parsed_data["Manufacturing Date"] = line.split("Manufacturing Date:")[1].strip()
 
139
  elif line.startswith("Expiry Date:"):
140
- parsed_data["Expiry Date"] = line.split("Expiry Date:")[1].strip()
141
- elif line.startswith("MRP Details:"):
142
- parsed_data["MRP Details"] = line.split("MRP Details:")[1].strip()
 
 
143
  return parsed_data
144
 
 
 
145
  # Function to call Gemini LLM for date predictions
146
  def call_gemini_llm_for_dates(text):
147
  # Use the previously set up Gemini model for predictions
148
  model = genai.GenerativeModel('models/gemini-1.5-flash')
149
  prompt = f"""
150
- You are provided with extracted words from a product's packaging. Based on this text, your task is to predict the manufacturing and expiry dates of the product.
151
 
152
  Please follow these rules:
153
  - If only one date is present, consider it to be the expiry date.
154
  - If the dates are detected as only Month and Year, provide them in the format MM/YYYY.
155
  - Ignore any noise or irrelevant information.
156
  - Predict the most logical manufacturing and expiry dates based on the context provided.
157
- - Output the dates strictly in the format:
 
 
 
158
  Manufacturing Date: DD/MM/YYYY or MM/YYYY
159
  Expiry Date: DD/MM/YYYY or MM/YYYY
160
- - Do not generate any other information or text besides the two dates.
 
161
 
162
  Here is the extracted text:
163
  {text}
164
  """
165
 
166
 
 
167
  # Send the prompt to Gemini model and get the response
168
  response = model.generate_content(prompt)
169
  print(response.text)
@@ -172,6 +299,8 @@ def call_gemini_llm_for_dates(text):
172
 
173
 
174
 
 
 
175
  def gradio_video_ocr_processing(video_file):
176
  input_video_path = video_file
177
  output_video_path = "annotated_video.mp4"
@@ -181,9 +310,8 @@ def gradio_video_ocr_processing(video_file):
181
 
182
  # Step 1: Frame similarity detection
183
  print("[DEBUG] Detecting non-similar frames.")
184
- non_similar_frames, frame_diff_video_path = frame_similarity_detection(input_video_path)
185
- # if len(non_similar_frames) > 100:
186
- # non_similar_frames = non_similar_frames[::2] # Select every alternate frame
187
  # Step 2: OCR processing and saving the results
188
  cap = cv2.VideoCapture(input_video_path)
189
  if not cap.isOpened():
@@ -197,10 +325,10 @@ def gradio_video_ocr_processing(video_file):
197
  out = None
198
 
199
  frame_skip = 2
200
- resize_width = 600
201
  detected_words = [["Frame", "Word", "Confidence", "X", "Y", "Width", "Height"]]
202
  frame_count = 0
203
-
204
  while cap.isOpened():
205
  ret, frame = cap.read()
206
  if not ret:
@@ -213,33 +341,46 @@ def gradio_video_ocr_processing(video_file):
213
  continue # Skip similar frames
214
 
215
  # Preprocess frame
216
- gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
217
- resized_frame = cv2.resize(gray, (resize_width, int(frame.shape[0] * resize_width / frame.shape[1])))
218
 
219
  print(f"[DEBUG] Processing frame {frame_count}.")
220
 
221
  # OCR processing with PaddleOCR
222
  results = ocr.ocr(resized_frame)
223
  if results[0] is not None:
224
- for line in results[0]:
225
- word, confidence = line[1][0], float(line[1][1])
226
- if confidence > 0.7:
227
- bbox = line[0]
228
- x_min, y_min = int(bbox[0][0]), int(bbox[0][1])
229
- x_max, y_max = int(bbox[2][0]), int(bbox[2][1])
230
- detected_words.append([frame_count, word, confidence, x_min, y_min, x_max - x_min, y_max - y_min])
231
-
232
- # Annotate the frame
233
- frame = cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
234
- frame = cv2.putText(frame, f"{word} ({confidence:.2f})", (x_min, y_min - 10),
235
- cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
 
 
 
 
 
 
 
 
 
 
 
236
  else:
237
- print(f"[DEBUG] No text detected in frame {frame_count}.")
 
 
 
 
238
 
239
- frame = add_branding(frame)
240
  if out is None:
241
- out = cv2.VideoWriter(output_video_path, fourcc, input_frame_rate,
242
- (frame.shape[1], frame.shape[0]))
243
  out.write(frame)
244
  frame_count += 1
245
 
@@ -266,5 +407,3 @@ def gradio_video_ocr_processing(video_file):
266
  print("[DEBUG] Gemini response generated.")
267
  return output_video_path, gemini_response, parsed_output
268
 
269
-
270
-
 
1
  import cv2
2
  import numpy as np
3
 
4
+ def frame_similarity_detection(video_path, scale_factor=0.45, output_video_path="non_similar_frames_output.mp4", target_frames=120):
5
  # Open the video file
6
  cap = cv2.VideoCapture(video_path)
7
 
8
  # Get the total number of frames in the video
9
  total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
10
+ original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) # Get original width
11
+ original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) # Get original height
12
  print(f"Total number of frames in the video: {total_frames}")
13
+ print(f"Original video resolution: {original_width}x{original_height}")
14
+
15
+ # If the total frames are less than the target, handle it gracefully
16
+ if total_frames <= target_frames:
17
+ print(f"Total frames ({total_frames}) are less than or equal to the target frames ({target_frames}). "
18
+ "All frames will be considered non-similar.")
19
+ frame_list = []
20
+
21
+ # Open output video writer with original resolution
22
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v') # Codec for mp4 format
23
+ out = cv2.VideoWriter(output_video_path, fourcc, cap.get(cv2.CAP_PROP_FPS), (original_width, original_height))
24
+
25
+ # Write all frames to the output video
26
+ while True:
27
+ ret, frame = cap.read()
28
+ if not ret:
29
+ break
30
+ frame_list.append(frame) # Keep original resolution
31
+ out.write(frame)
32
+
33
+ # Release resources
34
+ cap.release()
35
+ out.release()
36
+ cv2.destroyAllWindows()
37
+
38
+ print(f"Output video saved with all {total_frames} frames as non-similar.")
39
+ return list(range(1, total_frames + 1)), output_video_path
40
 
41
  # Initialize variables
42
  prev_frame = None
43
  frame_count = 0
44
  non_similar_frames = []
45
  frame_list = [] # List to store frames that are non-similar
46
+ frame_differences = [] # List to store the sum of frame differences
47
 
48
+ # Resize the frame dimensions for processing (not output)
49
+ resized_width = int(original_width * scale_factor)
50
+ resized_height = int(original_height * scale_factor)
 
 
 
 
 
 
 
 
 
 
51
 
52
  while True:
53
  ret, frame = cap.read()
 
56
 
57
  frame_count += 1
58
 
59
+ # Resize the frame to reduce resolution for faster processing
60
  resized_frame = cv2.resize(frame, (resized_width, resized_height))
61
 
 
 
 
 
62
  # Convert frame to grayscale (for faster processing)
63
  gray_frame = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2GRAY)
64
 
65
+ if prev_frame is not None:
66
+ # Compute absolute difference between current and previous frame
67
+ frame_diff = cv2.absdiff(prev_frame, gray_frame)
68
+
69
+ # Calculate the sum of differences
70
+ diff_sum = np.sum(frame_diff)
71
+ frame_differences.append(diff_sum) # Store the difference sum
72
+
73
+ # Set the current frame as the previous frame for the next iteration
74
+ prev_frame = gray_frame
75
+
76
+ # Release video capture to free memory
77
+ cap.release()
78
+
79
+ # Determine threshold dynamically to get close to target frames
80
+ frame_differences.sort(reverse=True) # Sort differences in descending order
81
+ if len(frame_differences) >= target_frames:
82
+ threshold = frame_differences[target_frames - 1] # Get the threshold for the 120th largest difference
83
+ else:
84
+ threshold = frame_differences[-1] if frame_differences else 0 # Fallback to smallest difference
85
+
86
+ print(f"Calculated threshold for approximately {target_frames} frames: {threshold}")
87
+
88
+ # Reopen the video to process frames again with the determined threshold
89
+ cap = cv2.VideoCapture(video_path)
90
+ frame_count = 0
91
+ prev_frame = None
92
+
93
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v') # Codec for mp4 format
94
+ out = cv2.VideoWriter(output_video_path, fourcc, cap.get(cv2.CAP_PROP_FPS), (original_width, original_height))
95
+
96
+ while True:
97
+ ret, frame = cap.read()
98
+ if not ret:
99
+ break
100
+
101
+ frame_count += 1
102
+ resized_frame = cv2.resize(frame, (resized_width, resized_height))
103
+ gray_frame = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2GRAY)
104
+
105
  if prev_frame is not None:
106
  # Compute absolute difference between current and previous frame
107
  frame_diff = cv2.absdiff(prev_frame, gray_frame)
 
112
  # If the difference is above the threshold, consider frames as non-similar
113
  if diff_sum > threshold:
114
  non_similar_frames.append(frame_count) # Save the frame number
115
+ frame_list.append(frame) # Store the non-similar frame with original resolution
116
 
117
  # Set the current frame as the previous frame for the next iteration
118
  prev_frame = gray_frame
119
 
120
  # If no non-similar frames were detected, add the first frame
121
+ if not non_similar_frames and total_frames > 0:
122
+ cap.set(cv2.CAP_PROP_POS_FRAMES, 0) # Go back to the first frame
123
+ ret, first_frame = cap.read()
124
+ if ret:
125
+ frame_list.append(first_frame)
126
 
127
+ # Write the non-similar frames to the output video file
128
  for frame in frame_list:
129
  out.write(frame)
130
 
131
+ # Release the video writer objects
132
  cap.release()
133
  out.release()
134
+ cv2.destroyAllWindows()
135
 
136
  # Print the list of frames that are not similar
137
  if non_similar_frames:
138
+ print(f"Frames not similar (above dynamic threshold of {threshold}): {non_similar_frames}")
139
  print(f"Output video saved as: {output_video_path}")
140
  else:
141
+ print(f"All frames are similar. One frame has been included.")
142
 
143
  print(f"Total non-similar frames: {len(non_similar_frames)}")
144
 
 
154
  import google.generativeai as genai
155
  import pandas as pd
156
  # from google.colab import userdata
157
+ from datetime import datetime
158
+
159
 
 
160
  # Define paths
161
  ocr = PaddleOCR(use_angle_cls=True, lang='en')
162
 
 
165
  GOOGLE_API_KEY = os.getenv("GEMINI_API")
166
  genai.configure(api_key=GOOGLE_API_KEY)
167
 
168
+ # Adjusted branding function to map back to original resolution
169
+ def add_branding(frame, text="Annotated Video OCR", position=(50, 50), font_scale=2, font_thickness=3,
170
+ text_color=(255, 255, 255), bg_color=(0, 0, 0), original_resolution=None):
171
+
172
+ # Use the original resolution for branding position
173
+ if original_resolution:
174
+ # Map position back to the original resolution
175
+ original_width, original_height = original_resolution
176
+ x, y = position
177
+ x = int(x * (original_width / frame.shape[1]))
178
+ y = int(y * (original_height / frame.shape[0]))
179
+
180
  overlay = frame.copy()
181
  alpha = 0.6 # Transparency factor
182
 
183
  # Get the width and height of the text box
184
  (text_width, text_height), _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, font_thickness)
185
+ x_end = x + text_width + 10 # Add padding to the right
186
+ y_end = y + text_height + 10 # Add padding to the bottom
187
+
188
+ # Ensure that the rectangle and text are within the frame boundaries
189
+ if x_end > frame.shape[1]: # Check for overflow horizontally
190
+ x = frame.shape[1] - text_width - 10
191
+ x_end = frame.shape[1] # Adjust the end point of the rectangle
192
+ if y_end > frame.shape[0]: # Check for overflow vertically
193
+ y = frame.shape[0] - text_height - 10
194
+ y_end = frame.shape[0] # Adjust the end point of the rectangle
195
+
196
+ # Draw a filled rectangle for background
197
+ cv2.rectangle(overlay, (x, y), (x_end, y_end), bg_color, -1)
198
+
199
+ # Add the overlay (with transparency)
200
  cv2.addWeighted(overlay, alpha, frame, 1 - alpha, 0, frame)
201
+
202
+ # Draw the text
203
+ cv2.putText(frame, text, (x + 5, y + text_height + 5), cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color, font_thickness)
204
 
205
  return frame
206
 
207
+
208
  # Function to preprocess the frame for OCR
209
+ def preprocess_frame(frame, resize_width=600, resize_height=None, grayscale=True):
210
+ # Store original resolution
211
+ original_height, original_width = frame.shape[:2]
212
+ print("[INFO] Original Height: ", original_height, "[INFO] Original Width: ", original_width)
213
+
214
+ # If resize_height is provided, resize both width and height independently
215
+ if resize_height is not None:
216
+ resized = cv2.resize(frame, (resize_width, resize_height))
217
+ else:
218
+ # Otherwise, resize only based on the width to maintain aspect ratio
219
+ resized = cv2.resize(frame, (resize_width, int(frame.shape[0] * (resize_width / frame.shape[1]))))
220
+
221
+ # Convert to grayscale if the grayscale flag is True
222
+ if grayscale:
223
+ resized = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
224
+
225
+ # Return both the resized frame and the original resolution for later use
226
+ return resized, (original_width, original_height)
227
+
228
+
229
  def parse_gemini_response(response_text):
230
+ def standardize_date(date_str):
231
+ """Convert date into DD/MM/YYYY format."""
232
+ try:
233
+ if "/" in date_str:
234
+ parts = date_str.split("/")
235
+ # If the format is MM/YYYY, append '01' as the day
236
+ if len(parts) == 2:
237
+ month = datetime.strptime(parts[0], "%b").month if len(parts[0]) == 3 else int(parts[0])
238
+ return f"01/{month:02d}/{parts[1]}"
239
+ # If the format is DD/MM/YYYY, return as is
240
+ elif len(parts) == 3:
241
+ day, month, year = parts
242
+ return f"{int(day):02d}/{int(month):02d}/{int(year)}"
243
+ return date_str # Return as is if it doesn't match expected patterns
244
+ except Exception:
245
+ return date_str # Fallback to original string if parsing fails
246
+
247
  parsed_data = {
248
  "Manufacturing Date": "",
249
  "Expiry Date": "",
250
+ "MRP": ""
251
  }
252
+
253
  for line in response_text.split("\n"):
254
  if line.startswith("Manufacturing Date:"):
255
+ raw_date = line.split("Manufacturing Date:")[1].strip()
256
+ parsed_data["Manufacturing Date"] = standardize_date(raw_date)
257
  elif line.startswith("Expiry Date:"):
258
+ raw_date = line.split("Expiry Date:")[1].strip()
259
+ parsed_data["Expiry Date"] = standardize_date(raw_date)
260
+ elif line.startswith("MRP:"):
261
+ parsed_data["MRP"] = line.split("MRP:")[1].strip()
262
+
263
  return parsed_data
264
 
265
+
266
+
267
  # Function to call Gemini LLM for date predictions
268
  def call_gemini_llm_for_dates(text):
269
  # Use the previously set up Gemini model for predictions
270
  model = genai.GenerativeModel('models/gemini-1.5-flash')
271
  prompt = f"""
272
+ You are provided with extracted words from a product's packaging. Based on this text, your task is to predict the manufacturing and expiry dates of the product, and extract the MRP details.
273
 
274
  Please follow these rules:
275
  - If only one date is present, consider it to be the expiry date.
276
  - If the dates are detected as only Month and Year, provide them in the format MM/YYYY.
277
  - Ignore any noise or irrelevant information.
278
  - Predict the most logical manufacturing and expiry dates based on the context provided.
279
+ - For MRP:
280
+ - Extract the value listed as the MRP, considering symbols like "₹", "Rs.", or "MRP".
281
+ - If no MRP is detected, output "MRP: Not available".
282
+ - Output the details strictly in the format:
283
  Manufacturing Date: DD/MM/YYYY or MM/YYYY
284
  Expiry Date: DD/MM/YYYY or MM/YYYY
285
+ MRP: ₹<value> or "Not available"
286
+ - Do not generate any other information or text besides the requested details.
287
 
288
  Here is the extracted text:
289
  {text}
290
  """
291
 
292
 
293
+
294
  # Send the prompt to Gemini model and get the response
295
  response = model.generate_content(prompt)
296
  print(response.text)
 
299
 
300
 
301
 
302
+
303
+
304
  def gradio_video_ocr_processing(video_file):
305
  input_video_path = video_file
306
  output_video_path = "annotated_video.mp4"
 
310
 
311
  # Step 1: Frame similarity detection
312
  print("[DEBUG] Detecting non-similar frames.")
313
+ non_similar_frames,frame_diff_video_path = frame_similarity_detection(input_video_path)
314
+
 
315
  # Step 2: OCR processing and saving the results
316
  cap = cv2.VideoCapture(input_video_path)
317
  if not cap.isOpened():
 
325
  out = None
326
 
327
  frame_skip = 2
328
+
329
  detected_words = [["Frame", "Word", "Confidence", "X", "Y", "Width", "Height"]]
330
  frame_count = 0
331
+ resize_width=600
332
  while cap.isOpened():
333
  ret, frame = cap.read()
334
  if not ret:
 
341
  continue # Skip similar frames
342
 
343
  # Preprocess frame
344
+ resized_frame, original_resolution = preprocess_frame(frame, resize_width)
 
345
 
346
  print(f"[DEBUG] Processing frame {frame_count}.")
347
 
348
  # OCR processing with PaddleOCR
349
  results = ocr.ocr(resized_frame)
350
  if results[0] is not None:
351
+ for line in results[0]:
352
+ word, confidence = line[1][0], float(line[1][1])
353
+ if confidence > 0.7:
354
+ bbox = line[0]
355
+
356
+ # Get bounding box coordinates in the resized frame
357
+ x_min_resized, y_min_resized = int(bbox[0][0]), int(bbox[0][1])
358
+ x_max_resized, y_max_resized = int(bbox[2][0]), int(bbox[2][1])
359
+
360
+ original_width, original_height=original_resolution
361
+ resized_height = (original_height/original_width)*resize_width
362
+ # Rescale the bounding box back to the original resolution
363
+ x_min = int(x_min_resized * (original_width / resize_width))
364
+ y_min = int(y_min_resized * (original_height / resized_height))
365
+ x_max = int(x_max_resized * (original_width / resize_width))
366
+ y_max = int(y_max_resized * (original_height / resized_height))
367
+
368
+ detected_words.append([frame_count, word, confidence, x_min, y_min, x_max - x_min, y_max - y_min])
369
+
370
+ # Annotate the frame with the detected text box on the original resolution
371
+ frame = cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
372
+ frame = cv2.putText(frame, f"{word} ({confidence:.2f})", (x_min, y_min - 10),
373
+ cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
374
  else:
375
+ print(f"[DEBUG] No text detected in frame {frame_count}.")
376
+ frame = add_branding(frame, original_resolution=original_resolution)
377
+
378
+ # Add branding to the frame using the original resolution for correct placement
379
+
380
 
 
381
  if out is None:
382
+ out = cv2.VideoWriter(output_video_path, fourcc, input_frame_rate,
383
+ (frame.shape[1], frame.shape[0]))
384
  out.write(frame)
385
  frame_count += 1
386
 
 
407
  print("[DEBUG] Gemini response generated.")
408
  return output_video_path, gemini_response, parsed_output
409