File size: 15,086 Bytes
f8ed977 b570e1b f8ed977 209ff06 f8ed977 209ff06 b570e1b 209ff06 b570e1b 209ff06 b570e1b f8ed977 209ff06 f8ed977 209ff06 f8ed977 209ff06 f8ed977 209ff06 f8ed977 209ff06 b570e1b f8ed977 209ff06 f8ed977 209ff06 f8ed977 b570e1b f8ed977 209ff06 f8ed977 112e3ad f8ed977 209ff06 f8ed977 209ff06 f8ed977 209ff06 f8ed977 209ff06 f8ed977 209ff06 f8ed977 209ff06 f8ed977 209ff06 f8ed977 209ff06 f8ed977 209ff06 f8ed977 209ff06 f8ed977 209ff06 f8ed977 209ff06 f8ed977 209ff06 f8ed977 209ff06 f8ed977 209ff06 f8ed977 209ff06 f8ed977 b338f2d 209ff06 f8ed977 209ff06 f8ed977 209ff06 f8ed977 209ff06 f8ed977 209ff06 f8ed977 209ff06 f8ed977 209ff06 f8ed977 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 
367 368 369 370 371 372 373 374 375 376 377 | import cv2
import numpy as np
def frame_similarity_detection(video_path, scale_factor=0.45, target_frames=120):
    """Pick the frames of a video that differ most from their predecessor.

    Every frame is downscaled by ``scale_factor``, converted to grayscale and
    compared with the previous frame using the sum of absolute pixel
    differences. The ``target_frames``-th largest difference becomes the
    threshold, and every frame whose difference is strictly above it is
    reported.

    Args:
        video_path: Path of the video to analyse.
        scale_factor: Factor applied to width and height before comparing
            (used only for the comparison, not for any output).
        target_frames: Approximate number of frames to select.

    Returns:
        A list of 1-based frame numbers. When the video reports no more than
        ``target_frames`` frames, every frame number is returned. When no
        frame exceeds the threshold, the list contains only frame 1.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        print(f"Total number of frames in the video: {total_frames}")
        print(f"Original video resolution: {original_width}x{original_height}")

        # Short videos: nothing to filter, keep every frame.
        if total_frames <= target_frames:
            print(f"Total frames ({total_frames}) are less than or equal to the target frames ({target_frames}). "
                  "All frames will be considered non-similar.")
            return list(range(1, total_frames + 1))

        # Never let the processing size truncate to zero, cv2.resize rejects it.
        resized_width = max(1, int(original_width * scale_factor))
        resized_height = max(1, int(original_height * scale_factor))

        # frame_differences[i] is the difference between frame i + 1 and
        # frame i + 2 (1-based), i.e. it belongs to frame number i + 2.
        frame_differences = []
        prev_frame = None
        while True:
            ret, frame = cap.read()
            if not ret:
                break  # No more frames available
            resized_frame = cv2.resize(frame, (resized_width, resized_height))
            gray_frame = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2GRAY)
            if prev_frame is not None:
                frame_differences.append(np.sum(cv2.absdiff(prev_frame, gray_frame)))
            prev_frame = gray_frame
    finally:
        # Released on every path, including the early return and errors.
        cap.release()

    # Determine the threshold dynamically to get close to the target count.
    ranked_differences = sorted(frame_differences, reverse=True)
    if len(ranked_differences) >= target_frames:
        threshold = ranked_differences[target_frames - 1]
    else:
        # Fallback to the smallest difference (or 0 when nothing was read).
        threshold = ranked_differences[-1] if ranked_differences else 0
    print(f"Calculated threshold for approximately {target_frames} frames: {threshold}")

    # The differences are already known, so the video is not decoded again.
    non_similar_frames = [
        frame_number
        for frame_number, diff_sum in enumerate(frame_differences, start=2)
        if diff_sum > threshold
    ]

    if not non_similar_frames and total_frames > 0:
        # Nothing exceeded the threshold: keep the first frame as a fallback.
        non_similar_frames.append(1)
        print("All frames are similar. One frame has been included.")
    else:
        print(f"Frames not similar (above dynamic threshold of {threshold}): {non_similar_frames}")
    print(f"Total non-similar frames: {len(non_similar_frames)}")
    return non_similar_frames
# Import necessary libraries
import cv2
from paddleocr import PaddleOCR, draw_ocr
# import paddle
import os
import csv
import numpy as np
import gradio as gr
import google.generativeai as genai
import pandas as pd
# from google.colab import userdata
from datetime import datetime
# Module-level setup, executed once when the file is imported.
# Shared PaddleOCR engine, constructed with use_angle_cls=True and lang='en'.
ocr = PaddleOCR(use_angle_cls=True, lang='en')
# The Gemini API key is read from the GEMINI_API environment variable;
# os.getenv returns None when the variable is not set.
GOOGLE_API_KEY = os.getenv("GEMINI_API")
genai.configure(api_key=GOOGLE_API_KEY)
# Adjusted branding function to map back to original resolution
def add_branding(frame, text="Annotated Video OCR", position=(50, 50), font_scale=2, font_thickness=3,
                 text_color=(255, 255, 255), bg_color=(0, 0, 0), original_resolution=None):
    """Draw a semi-transparent label box with ``text`` onto ``frame``.

    Args:
        frame: Image array to draw on; it is modified in place.
        text: Label to render.
        position: ``(x, y)`` of the top-left corner of the label box.
        font_scale: Scale passed to the OpenCV text functions.
        font_thickness: Stroke thickness passed to the OpenCV text functions.
        text_color: Colour tuple of the text.
        bg_color: Colour tuple of the background rectangle.
        original_resolution: Optional ``(width, height)``. When given,
            ``position`` is scaled by the ratio between this resolution and
            the size of ``frame``. When omitted, ``position`` is used as is.

    Returns:
        The same ``frame`` object, with the label drawn on it.
    """
    # Always start from the requested position; previously x and y were only
    # assigned when original_resolution was supplied, so the default call
    # failed with an unbound local variable.
    x, y = int(position[0]), int(position[1])
    frame_height, frame_width = frame.shape[:2]
    if original_resolution:
        original_width, original_height = original_resolution
        x = int(x * (original_width / frame_width))
        y = int(y * (original_height / frame_height))

    overlay = frame.copy()
    alpha = 0.6  # Transparency factor

    # Size of the rendered text, used to size the background box.
    (text_width, text_height), _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, font_thickness)
    x_end = x + text_width + 10  # Padding to the right
    y_end = y + text_height + 10  # Padding to the bottom

    # Keep the box inside the frame; clamp at 0 so a label wider or taller
    # than the frame does not produce negative coordinates.
    if x_end > frame_width:
        x = max(0, frame_width - text_width - 10)
        x_end = frame_width
    if y_end > frame_height:
        y = max(0, frame_height - text_height - 10)
        y_end = frame_height

    # Filled rectangle on the copy, then blend the copy back into the frame.
    cv2.rectangle(overlay, (x, y), (x_end, y_end), bg_color, -1)
    cv2.addWeighted(overlay, alpha, frame, 1 - alpha, 0, frame)

    # Text is drawn after blending so it stays fully opaque.
    cv2.putText(frame, text, (x + 5, y + text_height + 5), cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color,
                font_thickness)
    return frame
# Function to preprocess the frame for OCR
def preprocess_frame(frame, resize_width=600, resize_height=None, grayscale=True):
    """Resize a frame for OCR and report the size it had before resizing.

    Args:
        frame: Image array as read from the video.
        resize_width: Width of the resized frame.
        resize_height: Height of the resized frame. When ``None`` the height
            is derived from ``resize_width`` so the aspect ratio is kept.
        grayscale: Convert the resized frame from BGR to grayscale when true.

    Returns:
        ``(resized_frame, (original_width, original_height))``.
    """
    source_height, source_width = frame.shape[:2]
    print("[INFO] Original Height: ", source_height, "[INFO] Original Width: ", source_width)

    if resize_height is None:
        # Only the width was requested: scale the height by the same ratio.
        target_size = (resize_width, int(source_height * (resize_width / source_width)))
    else:
        # Both dimensions were requested: use them independently.
        target_size = (resize_width, resize_height)

    output = cv2.resize(frame, target_size)
    if grayscale:
        output = cv2.cvtColor(output, cv2.COLOR_BGR2GRAY)

    # The original size travels with the result so callers can map back.
    return output, (source_width, source_height)
def parse_gemini_response(response_text):
    """Extract the manufacturing date, expiry date and MRP from a model reply.

    The reply is scanned line by line for the labels ``Manufacturing Date:``,
    ``Expiry Date:`` and ``MRP:``. Surrounding whitespace on each line is
    ignored, so indented replies are parsed as well. Dates are normalised to
    ``DD/MM/YYYY`` where possible; the MRP is kept as written.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        A dict with the keys ``"Manufacturing Date"``, ``"Expiry Date"`` and
        ``"MRP"``. A field that does not appear in the reply is ``""``.
    """

    def standardize_date(date_str):
        """Convert date into DD/MM/YYYY format; return it unchanged if it cannot be parsed."""
        parts = [part.strip() for part in date_str.split("/")]
        try:
            if len(parts) == 2:
                # MM/YYYY or Mon/YYYY: assume the first day of the month.
                month_token, year = parts
                if month_token.isdigit():
                    month = int(month_token)
                else:
                    month = datetime.strptime(month_token, "%b").month
                return f"01/{month:02d}/{year}"
            if len(parts) == 3:
                # DD/MM/YYYY: only zero-pad the components.
                day, month, year = parts
                return f"{int(day):02d}/{int(month):02d}/{int(year)}"
        except ValueError:
            # Non-numeric or unknown components: keep the model's wording.
            return date_str
        return date_str  # No recognised pattern (e.g. "Not available")

    labels = ("Manufacturing Date", "Expiry Date", "MRP")
    parsed_data = dict.fromkeys(labels, "")

    for raw_line in response_text.split("\n"):
        line = raw_line.strip()
        for label in labels:
            prefix = f"{label}:"
            if line.startswith(prefix):
                value = line[len(prefix):].strip()
                parsed_data[label] = value if label == "MRP" else standardize_date(value)
                break
    return parsed_data
# Function to call Gemini LLM for date predictions
def call_gemini_llm_for_dates(text):
    """Ask the Gemini model for manufacturing date, expiry date and MRP.

    Args:
        text: Words extracted from the product packaging, as one string.

    Returns:
        The model's reply text with surrounding whitespace removed. The raw
        reply is also printed.
    """
    gemini = genai.GenerativeModel('models/gemini-1.5-flash')

    # The instructions pin the output to three labelled lines.
    prompt = f"""
You are provided with extracted words from a product's packaging. Based on this text, your task is to predict the manufacturing and expiry dates of the product, and extract the MRP details.
Please follow these rules:
- If only one date is present, consider it to be the expiry date.
- If the dates are detected as only Month and Year, provide them in the format MM/YYYY.
- Ignore any noise or irrelevant information.
- Predict the most logical manufacturing and expiry dates based on the context provided.
- For MRP:
- Extract the value listed as the MRP, considering symbols like "₹", "Rs.", or "MRP".
- If no MRP is detected, output "MRP: Not available".
- Output the details strictly in the format:
Manufacturing Date: DD/MM/YYYY or MM/YYYY
Expiry Date: DD/MM/YYYY or MM/YYYY
MRP: ₹<value> or "Not available"
- Do not generate any other information or text besides the requested details.
Here is the extracted text:
{text}
"""

    reply = gemini.generate_content(prompt)
    print(reply.text)
    return reply.text.strip()
def gradio_video_ocr_processing(video_file):
    """Run OCR over the distinctive frames of a video and extract product details.

    Steps: select frames with ``frame_similarity_detection``, run the
    module-level PaddleOCR engine on each selected frame, write the annotated
    frames to ``annotated_video.mp4`` and the detected words to
    ``detected_words.csv``, then send the unique words to Gemini and parse
    its reply.

    Args:
        video_file: Path of the input video.

    Returns:
        ``(video_path, gemini_response, parsed_output)``. ``video_path`` is
        ``None`` when no frame was written. When the video cannot be opened
        the result is ``(None, "Error: Cannot open video file.", {})`` so the
        tuple always has three items.
    """
    input_video_path = video_file
    output_video_path = "annotated_video.mp4"
    output_text_file = "detected_words.csv"
    resize_width = 600
    print("[DEBUG] Starting video processing.")

    # Step 1: Frame similarity detection
    print("[DEBUG] Detecting non-similar frames.")
    # frame_similarity_detection numbers frames from 1, while the loop below
    # counts from 0. Convert once, into a set for constant-time lookups.
    selected_frames = {number - 1 for number in frame_similarity_detection(input_video_path)}

    # Step 2: OCR processing and saving the results
    cap = cv2.VideoCapture(input_video_path)
    if not cap.isOpened():
        cap.release()
        print("[ERROR] Cannot open video file.")
        return None, "Error: Cannot open video file.", {}

    input_frame_rate = cap.get(cv2.CAP_PROP_FPS)
    print(f"[DEBUG] Input video frame rate: {input_frame_rate} FPS.")
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = None
    detected_words = [["Frame", "Word", "Confidence", "X", "Y", "Width", "Height"]]
    frame_count = 0

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                print("[DEBUG] End of video stream.")
                break

            # Only process non-similar frames
            if frame_count not in selected_frames:
                frame_count += 1
                continue

            resized_frame, original_resolution = preprocess_frame(frame, resize_width)
            print(f"[DEBUG] Processing frame {frame_count}.")

            # Scale factors from the frame that was actually given to the OCR
            # engine back to the original frame. Using the real resized shape
            # avoids drift from the integer truncation done while resizing.
            original_width, original_height = original_resolution
            resized_height, resized_width = resized_frame.shape[:2]
            scale_x = original_width / resized_width
            scale_y = original_height / resized_height

            results = ocr.ocr(resized_frame)
            text_lines = results[0] if results else None
            if text_lines:
                for line in text_lines:
                    word, confidence = line[1][0], float(line[1][1])
                    if confidence <= 0.7:
                        continue
                    bbox = line[0]
                    # First and third box points are used as opposite corners.
                    x_min = int(int(bbox[0][0]) * scale_x)
                    y_min = int(int(bbox[0][1]) * scale_y)
                    x_max = int(int(bbox[2][0]) * scale_x)
                    y_max = int(int(bbox[2][1]) * scale_y)
                    detected_words.append(
                        [frame_count, word, confidence, x_min, y_min, x_max - x_min, y_max - y_min])

                    # Annotate the original-resolution frame.
                    frame = cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                    frame = cv2.putText(frame, f"{word} ({confidence:.2f})", (x_min, y_min - 10),
                                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
            else:
                print(f"[DEBUG] No text detected in frame {frame_count}.")

            # Add branding using the original resolution for correct placement
            frame = add_branding(frame, original_resolution=original_resolution)

            # The writer is created lazily so it matches the real frame size.
            if out is None:
                out = cv2.VideoWriter(output_video_path, fourcc, input_frame_rate,
                                      (frame.shape[1], frame.shape[0]))
            out.write(frame)
            frame_count += 1
    finally:
        # Release the capture and the writer even if OCR or drawing fails.
        cap.release()
        if out is not None:
            out.release()

    # Save detected words to CSV
    with open(output_text_file, 'w', newline='', encoding='utf-8') as file:
        csv.writer(file).writerows(detected_words)
    print(f"[INFO] Detected words saved to {output_text_file}.")
    print(f"[INFO] Annotated video saved to {output_video_path}.")

    # Build the prompt text from the in-memory rows: unique words in order of
    # first appearance. Reading the CSV back would turn words such as "NA" or
    # "null" into missing values and purely numeric words into numbers.
    unique_words = dict.fromkeys(str(row[1]) for row in detected_words[1:] if row[1])
    detected_text = " ".join(unique_words)

    gemini_response = call_gemini_llm_for_dates(detected_text)
    parsed_output = parse_gemini_response(gemini_response)
    print("[DEBUG] Gemini response generated.")

    # Without any written frame there is no fresh video file to hand back.
    video_result = output_video_path if out is not None else None
    return video_result, gemini_response, parsed_output
|