Update Backend/OCR/Dynamic/VideoOCR.py
Backend/OCR/Dynamic/VideoOCR.py · +212 −73 · CHANGED
@@ -1,33 +1,53 @@
 import cv2
 import numpy as np

-def frame_similarity_detection(video_path, threshold=350*1E4, scale_factor=0.45, …):
+def frame_similarity_detection(video_path, scale_factor=0.45, output_video_path="non_similar_frames_output.mp4", target_frames=120):
     # Open the video file
     cap = cv2.VideoCapture(video_path)

     # Get the total number of frames in the video
     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))    # Get original width
+    original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))  # Get original height
     print(f"Total number of frames in the video: {total_frames}")
+    print(f"Original video resolution: {original_width}x{original_height}")
+
+    # If the total frames are less than the target, handle it gracefully
+    if total_frames <= target_frames:
+        print(f"Total frames ({total_frames}) are less than or equal to the target frames ({target_frames}). "
+              "All frames will be considered non-similar.")
+        frame_list = []
+
+        # Open output video writer with original resolution
+        fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for mp4 format
+        out = cv2.VideoWriter(output_video_path, fourcc, cap.get(cv2.CAP_PROP_FPS), (original_width, original_height))
+
+        # Write all frames to the output video
+        while True:
+            ret, frame = cap.read()
+            if not ret:
+                break
+            frame_list.append(frame)  # Keep original resolution
+            out.write(frame)
+
+        # Release resources
+        cap.release()
+        out.release()
+        cv2.destroyAllWindows()
+
+        print(f"Output video saved with all {total_frames} frames as non-similar.")
+        return list(range(1, total_frames + 1)), output_video_path

     # Initialize variables
     prev_frame = None
     frame_count = 0
     non_similar_frames = []
     frame_list = []  # List to store frames that are non-similar
+    frame_differences = []  # List to store the sum of frame differences

-    # …
-    …
-    …
-    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-
-    # Resize the frame dimensions
-    resized_width = 640
-    resized_height = 480
-
-    out = cv2.VideoWriter(output_video_path, fourcc, cap.get(cv2.CAP_PROP_FPS), (resized_width, resized_height))
-
-    # To store the first frame in case all frames are similar
-    first_frame = None
+    # Resize the frame dimensions for processing (not output)
+    resized_width = int(original_width * scale_factor)
+    resized_height = int(original_height * scale_factor)

     while True:
         ret, frame = cap.read()
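Note on the new early-exit path above: the function now returns a (frame_indices, output_path) pair when the clip is already at or under the target frame count. A hedged usage sketch of that contract ("short_clip.mp4" is a hypothetical test file, not part of this commit):

frames, out_path = frame_similarity_detection("short_clip.mp4", target_frames=120)  # hypothetical input
print(len(frames), out_path)  # for a clip with <= 120 frames, every frame index 1..N comes back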
@@ -36,16 +56,52 @@ def frame_similarity_detection(video_path, threshold=350*1E4, scale_factor=0.45,

         frame_count += 1

-        # Resize the frame to reduce resolution
+        # Resize the frame to reduce resolution for faster processing
         resized_frame = cv2.resize(frame, (resized_width, resized_height))

-        # Save the first frame to be used later if needed
-        if frame_count == 1:
-            first_frame = resized_frame
-
         # Convert frame to grayscale (for faster processing)
         gray_frame = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2GRAY)

+        if prev_frame is not None:
+            # Compute absolute difference between current and previous frame
+            frame_diff = cv2.absdiff(prev_frame, gray_frame)
+
+            # Calculate the sum of differences
+            diff_sum = np.sum(frame_diff)
+            frame_differences.append(diff_sum)  # Store the difference sum
+
+        # Set the current frame as the previous frame for the next iteration
+        prev_frame = gray_frame
+
+    # Release video capture to free memory
+    cap.release()
+
+    # Determine threshold dynamically to get close to target frames
+    frame_differences.sort(reverse=True)  # Sort differences in descending order
+    if len(frame_differences) >= target_frames:
+        threshold = frame_differences[target_frames - 1]  # Get the threshold for the 120th largest difference
+    else:
+        threshold = frame_differences[-1] if frame_differences else 0  # Fallback to smallest difference
+
+    print(f"Calculated threshold for approximately {target_frames} frames: {threshold}")
+
+    # Reopen the video to process frames again with the determined threshold
+    cap = cv2.VideoCapture(video_path)
+    frame_count = 0
+    prev_frame = None
+
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for mp4 format
+    out = cv2.VideoWriter(output_video_path, fourcc, cap.get(cv2.CAP_PROP_FPS), (original_width, original_height))
+
+    while True:
+        ret, frame = cap.read()
+        if not ret:
+            break
+
+        frame_count += 1
+        resized_frame = cv2.resize(frame, (resized_width, resized_height))
+        gray_frame = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2GRAY)
+
         if prev_frame is not None:
             # Compute absolute difference between current and previous frame
             frame_diff = cv2.absdiff(prev_frame, gray_frame)
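The dynamic threshold above is a top-k cut over the per-frame difference sums: sort descending and take the value at index target_frames - 1. A minimal equivalent sketch of the same rule; np.partition is shown as an illustration and is not what the commit uses:

import numpy as np

def pick_threshold(frame_differences, target_frames):
    # Mirrors the hunk above: keep roughly the target_frames largest differences.
    if not frame_differences:
        return 0
    if len(frame_differences) >= target_frames:
        # Value of the target_frames-th largest difference, without a full sort
        return np.partition(frame_differences, -target_frames)[-target_frames]
    return min(frame_differences)  # Fallback: the smallest recorded difference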
@@ -56,30 +112,33 @@ def frame_similarity_detection(video_path, threshold=350*1E4, scale_factor=0.45,
            # If the difference is above the threshold, consider frames as non-similar
            if diff_sum > threshold:
                non_similar_frames.append(frame_count)  # Save the frame number
-                frame_list.append(…
+                frame_list.append(frame)  # Store the non-similar frame with original resolution

         # Set the current frame as the previous frame for the next iteration
         prev_frame = gray_frame

     # If no non-similar frames were detected, add the first frame
-    if not non_similar_frames and …
-    …
+    if not non_similar_frames and total_frames > 0:
+        cap.set(cv2.CAP_PROP_POS_FRAMES, 0)  # Go back to the first frame
+        ret, first_frame = cap.read()
+        if ret:
+            frame_list.append(first_frame)

-    # Write the non-similar frames …
+    # Write the non-similar frames to the output video file
     for frame in frame_list:
         out.write(frame)

-    # Release the video …
+    # Release the video writer objects
     cap.release()
     out.release()
-    cv2.destroyAllWindows()
+    cv2.destroyAllWindows()

     # Print the list of frames that are not similar
     if non_similar_frames:
-        print(f"Frames not similar (above …
+        print(f"Frames not similar (above dynamic threshold of {threshold}): {non_similar_frames}")
         print(f"Output video saved as: {output_video_path}")
     else:
-        print(f"All frames are similar …
+        print(f"All frames are similar. One frame has been included.")

     print(f"Total non-similar frames: {len(non_similar_frames)}")
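The similarity metric throughout is the raw sum of absolute grayscale differences, so the threshold scales with the resized frame area. A worked micro-example on 2x2 arrays (values are illustrative):

import cv2
import numpy as np

a = np.array([[10, 20], [30, 40]], dtype=np.uint8)
b = np.array([[12, 18], [35, 40]], dtype=np.uint8)
diff_sum = int(np.sum(cv2.absdiff(a, b)))  # |10-12| + |20-18| + |30-35| + |40-40| = 2 + 2 + 5 + 0
print(diff_sum)  # 9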
@@ -95,8 +154,9 @@ import gradio as gr
 import google.generativeai as genai
 import pandas as pd
 # from google.colab import userdata
+from datetime import datetime
+

-# use_gpu = paddle.device.is_compiled_with_cuda() and paddle.device.get_device().startswith("gpu")
 # Define paths
 ocr = PaddleOCR(use_angle_cls=True, lang='en')

@@ -105,65 +165,132 @@ ocr = PaddleOCR(use_angle_cls=True, lang='en')
 GOOGLE_API_KEY = os.getenv("GEMINI_API")
 genai.configure(api_key=GOOGLE_API_KEY)

-# …
-def add_branding(frame, text="…
-                 text_color=(255, 255, 255), bg_color=(0, 0, 0)):
+# Adjusted branding function to map back to original resolution
+def add_branding(frame, text="Annotated Video OCR", position=(50, 50), font_scale=2, font_thickness=3,
+                 text_color=(255, 255, 255), bg_color=(0, 0, 0), original_resolution=None):
+
+    # Use the original resolution for branding position
+    if original_resolution:
+        # Map position back to the original resolution
+        original_width, original_height = original_resolution
+        x, y = position
+        x = int(x * (original_width / frame.shape[1]))
+        y = int(y * (original_height / frame.shape[0]))
+
     overlay = frame.copy()
     alpha = 0.6  # Transparency factor

     # Get the width and height of the text box
     (text_width, text_height), _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, font_thickness)
-    x…
-    …
-    …
-    …
+    x_end = x + text_width + 10   # Add padding to the right
+    y_end = y + text_height + 10  # Add padding to the bottom
+
+    # Ensure that the rectangle and text are within the frame boundaries
+    if x_end > frame.shape[1]:  # Check for overflow horizontally
+        x = frame.shape[1] - text_width - 10
+        x_end = frame.shape[1]  # Adjust the end point of the rectangle
+    if y_end > frame.shape[0]:  # Check for overflow vertically
+        y = frame.shape[0] - text_height - 10
+        y_end = frame.shape[0]  # Adjust the end point of the rectangle
+
+    # Draw a filled rectangle for background
+    cv2.rectangle(overlay, (x, y), (x_end, y_end), bg_color, -1)
+
+    # Add the overlay (with transparency)
     cv2.addWeighted(overlay, alpha, frame, 1 - alpha, 0, frame)
-    …
+
+    # Draw the text
+    cv2.putText(frame, text, (x + 5, y + text_height + 5), cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color, font_thickness)

     return frame

+
 # Function to preprocess the frame for OCR
-def preprocess_frame(frame, resize_width=600):
-    …
-    …
-    …
+def preprocess_frame(frame, resize_width=600, resize_height=None, grayscale=True):
+    # Store original resolution
+    original_height, original_width = frame.shape[:2]
+    print("[INFO] Original Height: ", original_height, "[INFO] Original Width: ", original_width)
+
+    # If resize_height is provided, resize both width and height independently
+    if resize_height is not None:
+        resized = cv2.resize(frame, (resize_width, resize_height))
+    else:
+        # Otherwise, resize only based on the width to maintain aspect ratio
+        resized = cv2.resize(frame, (resize_width, int(frame.shape[0] * (resize_width / frame.shape[1]))))
+
+    # Convert to grayscale if the grayscale flag is True
+    if grayscale:
+        resized = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
+
+    # Return both the resized frame and the original resolution for later use
+    return resized, (original_width, original_height)
+
+
 def parse_gemini_response(response_text):
+    def standardize_date(date_str):
+        """Convert date into DD/MM/YYYY format."""
+        try:
+            if "/" in date_str:
+                parts = date_str.split("/")
+                # If the format is MM/YYYY, append '01' as the day
+                if len(parts) == 2:
+                    month = datetime.strptime(parts[0], "%b").month if len(parts[0]) == 3 else int(parts[0])
+                    return f"01/{month:02d}/{parts[1]}"
+                # If the format is DD/MM/YYYY, return as is
+                elif len(parts) == 3:
+                    day, month, year = parts
+                    return f"{int(day):02d}/{int(month):02d}/{int(year)}"
+            return date_str  # Return as is if it doesn't match expected patterns
+        except Exception:
+            return date_str  # Fallback to original string if parsing fails
+
     parsed_data = {
         "Manufacturing Date": "",
         "Expiry Date": "",
-        "MRP…
+        "MRP": ""
     }
+
     for line in response_text.split("\n"):
         if line.startswith("Manufacturing Date:"):
-            …
+            raw_date = line.split("Manufacturing Date:")[1].strip()
+            parsed_data["Manufacturing Date"] = standardize_date(raw_date)
         elif line.startswith("Expiry Date:"):
-            …
-            …
-            …
+            raw_date = line.split("Expiry Date:")[1].strip()
+            parsed_data["Expiry Date"] = standardize_date(raw_date)
+        elif line.startswith("MRP:"):
+            parsed_data["MRP"] = line.split("MRP:")[1].strip()
+
     return parsed_data

+
+
 # Function to call Gemini LLM for date predictions
 def call_gemini_llm_for_dates(text):
     # Use the previously set up Gemini model for predictions
     model = genai.GenerativeModel('models/gemini-1.5-flash')
     prompt = f"""
-    You are provided with extracted words from a product's packaging. Based on this text, your task is to predict the manufacturing and expiry dates of the product.
+    You are provided with extracted words from a product's packaging. Based on this text, your task is to predict the manufacturing and expiry dates of the product, and extract the MRP details.

     Please follow these rules:
     - If only one date is present, consider it to be the expiry date.
     - If the dates are detected as only Month and Year, provide them in the format MM/YYYY.
     - Ignore any noise or irrelevant information.
     - Predict the most logical manufacturing and expiry dates based on the context provided.
-    …
+    - For MRP:
+      - Extract the value listed as the MRP, considering symbols like "₹", "Rs.", or "MRP".
+      - If no MRP is detected, output "MRP: Not available".
+    - Output the details strictly in the format:
     Manufacturing Date: DD/MM/YYYY or MM/YYYY
     Expiry Date: DD/MM/YYYY or MM/YYYY
-    …
+    MRP: ₹<value> or "Not available"
+    - Do not generate any other information or text besides the requested details.

     Here is the extracted text:
     {text}
     """


+
     # Send the prompt to Gemini model and get the response
     response = model.generate_content(prompt)
     print(response.text)
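A hedged usage sketch of the new parser on a response shaped like the prompt's required output format (the sample text is made up):

sample = "Manufacturing Date: 03/2024\nExpiry Date: 15/09/2025\nMRP: ₹49"
print(parse_gemini_response(sample))
# Expected: {'Manufacturing Date': '01/03/2024', 'Expiry Date': '15/09/2025', 'MRP': '₹49'}

standardize_date pads a bare MM/YYYY to the first of the month ("03/2024" becomes "01/03/2024") and also accepts three-letter month names ("Mar/2024") via datetime.strptime(..., "%b").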
@@ -172,6 +299,8 @@ def call_gemini_llm_for_dates(text):



+
+
 def gradio_video_ocr_processing(video_file):
     input_video_path = video_file
     output_video_path = "annotated_video.mp4"
@@ -181,9 +310,8 @@ def gradio_video_ocr_processing(video_file):

     # Step 1: Frame similarity detection
     print("[DEBUG] Detecting non-similar frames.")
-    non_similar_frames, …
-    …
-    # non_similar_frames = non_similar_frames[::2]  # Select every alternate frame
+    non_similar_frames, frame_diff_video_path = frame_similarity_detection(input_video_path)
+
     # Step 2: OCR processing and saving the results
     cap = cv2.VideoCapture(input_video_path)
     if not cap.isOpened():
@@ -197,10 +325,10 @@ def gradio_video_ocr_processing(video_file):
     out = None

     frame_skip = 2
-    …
+
     detected_words = [["Frame", "Word", "Confidence", "X", "Y", "Width", "Height"]]
     frame_count = 0
-    …
+    resize_width = 600
     while cap.isOpened():
         ret, frame = cap.read()
         if not ret:
@@ -213,33 +341,46 @@ def gradio_video_ocr_processing(video_file):
             continue  # Skip similar frames

         # Preprocess frame
-        …
-        resized_frame = cv2.resize(gray, (resize_width, int(frame.shape[0] * resize_width / frame.shape[1])))
+        resized_frame, original_resolution = preprocess_frame(frame, resize_width)

         print(f"[DEBUG] Processing frame {frame_count}.")

         # OCR processing with PaddleOCR
         results = ocr.ocr(resized_frame)
         if results[0] is not None:
-            …
+            for line in results[0]:
+                word, confidence = line[1][0], float(line[1][1])
+                if confidence > 0.7:
+                    bbox = line[0]
+
+                    # Get bounding box coordinates in the resized frame
+                    x_min_resized, y_min_resized = int(bbox[0][0]), int(bbox[0][1])
+                    x_max_resized, y_max_resized = int(bbox[2][0]), int(bbox[2][1])
+
+                    original_width, original_height = original_resolution
+                    resized_height = (original_height / original_width) * resize_width
+                    # Rescale the bounding box back to the original resolution
+                    x_min = int(x_min_resized * (original_width / resize_width))
+                    y_min = int(y_min_resized * (original_height / resized_height))
+                    x_max = int(x_max_resized * (original_width / resize_width))
+                    y_max = int(y_max_resized * (original_height / resized_height))
+
+                    detected_words.append([frame_count, word, confidence, x_min, y_min, x_max - x_min, y_max - y_min])
+
+                    # Annotate the frame with the detected text box on the original resolution
+                    frame = cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
+                    frame = cv2.putText(frame, f"{word} ({confidence:.2f})", (x_min, y_min - 10),
+                                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
         else:
-            …
+            print(f"[DEBUG] No text detected in frame {frame_count}.")

-        frame = add_branding(frame)
+        # Add branding to the frame using the original resolution for correct placement
+        frame = add_branding(frame, original_resolution=original_resolution)

         if out is None:
-            …
-            …
+            out = cv2.VideoWriter(output_video_path, fourcc, input_frame_rate,
+                                  (frame.shape[1], frame.shape[0]))
         out.write(frame)
         frame_count += 1
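The bounding-box rescale above maps resized-frame coordinates back to the source frame. A worked example with an assumed 1920x1080 source and the resize_width of 600 used here (numbers are illustrative):

original_width, original_height = 1920, 1080  # assumed source resolution
resize_width = 600
resized_height = (original_height / original_width) * resize_width  # 337.5

x_min_resized, y_min_resized = 100, 50
x_min = int(x_min_resized * (original_width / resize_width))     # int(100 * 3.2) = 320
y_min = int(y_min_resized * (original_height / resized_height))  # int(50 * 3.2)  = 160
print(x_min, y_min)  # 320 160 (both axes share the factor 3.2 because the aspect ratio is preserved)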
@@ -266,5 +407,3 @@ def gradio_video_ocr_processing(video_file):
     print("[DEBUG] Gemini response generated.")
     return output_video_path, gemini_response, parsed_output

-
-
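The hunks show only the processing function's body; a minimal sketch of how gradio_video_ocr_processing could be wired into the app, assuming its three return values map to a video, a textbox, and a JSON component (this interface code is not part of the commit):

import gradio as gr

demo = gr.Interface(
    fn=gradio_video_ocr_processing,
    inputs=gr.Video(label="Product video"),
    outputs=[gr.Video(label="Annotated video"),
             gr.Textbox(label="Gemini response"),
             gr.JSON(label="Parsed dates and MRP")],
)
demo.launch()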