File size: 15,086 Bytes
f8ed977
 
 
b570e1b
f8ed977
 
 
 
 
209ff06
 
f8ed977
209ff06
 
 
 
 
 
b570e1b
209ff06
b570e1b
209ff06
b570e1b
f8ed977
 
 
 
 
209ff06
f8ed977
209ff06
 
 
f8ed977
 
 
 
 
 
 
 
209ff06
f8ed977
 
 
 
 
209ff06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f8ed977
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209ff06
b570e1b
f8ed977
 
 
209ff06
f8ed977
209ff06
f8ed977
 
 
b570e1b
 
f8ed977
 
 
 
 
 
 
 
 
 
 
209ff06
 
f8ed977
 
112e3ad
f8ed977
 
 
 
 
 
209ff06
 
 
 
 
 
 
 
 
 
 
 
f8ed977
 
 
 
 
209ff06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f8ed977
209ff06
 
 
f8ed977
 
 
209ff06
f8ed977
209ff06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f8ed977
209ff06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f8ed977
 
 
209ff06
f8ed977
209ff06
f8ed977
 
209ff06
 
f8ed977
209ff06
 
 
 
 
f8ed977
 
209ff06
 
f8ed977
 
 
 
 
209ff06
f8ed977
 
 
 
 
 
209ff06
 
 
 
f8ed977
 
209ff06
 
f8ed977
 
 
 
 
 
209ff06
f8ed977
 
 
 
 
 
 
 
209ff06
 
f8ed977
 
 
 
 
 
 
 
 
b338f2d
209ff06
f8ed977
 
 
 
 
 
 
 
 
 
 
 
 
209ff06
f8ed977
 
209ff06
f8ed977
 
 
 
 
 
 
 
 
 
 
 
209ff06
f8ed977
 
 
 
 
 
209ff06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f8ed977
209ff06
 
 
 
 
f8ed977
 
209ff06
 
f8ed977
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
import cv2
import numpy as np

def frame_similarity_detection(video_path, scale_factor=0.45, target_frames=120):
    """Select the frame numbers (1-based) whose content differs most from the previous frame.

    The per-frame difference is the pixel-sum of ``cv2.absdiff`` between consecutive
    downscaled grayscale frames. A threshold is chosen so that approximately
    ``target_frames`` frames exceed it (frames whose diff is strictly greater than
    the ``target_frames``-th largest diff are kept).

    Parameters:
        video_path (str): Path to the input video file.
        scale_factor (float): Downscale factor applied before diffing (processing only).
        target_frames (int): Approximate number of frames to select.

    Returns:
        list[int]: 1-based frame numbers considered non-similar. If the video has
        ``target_frames`` or fewer frames, all frame numbers are returned. If no
        frame exceeds the threshold, frame 1 is returned as a fallback.
    """
    cap = cv2.VideoCapture(video_path)

    # Probe basic stream properties up front.
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    print(f"Total number of frames in the video: {total_frames}")
    print(f"Original video resolution: {original_width}x{original_height}")

    # Short video: every frame counts as non-similar.
    if total_frames <= target_frames:
        print(f"Total frames ({total_frames}) are less than or equal to the target frames ({target_frames}). "
              "All frames will be considered non-similar.")
        cap.release()
        return list(range(1, total_frames + 1))

    # Reduced processing resolution (output frames are untouched).
    resized_width = int(original_width * scale_factor)
    resized_height = int(original_height * scale_factor)

    # FIX: the original implementation decoded the entire video twice — once to
    # collect diffs and once more to re-compute the identical diffs against the
    # threshold. One pass that remembers (frame_number, diff_sum) pairs produces
    # the same result with half the decoding work.
    prev_gray = None
    frame_count = 0
    frame_diffs = []  # (1-based frame number, diff sum vs. previous frame)

    while True:
        ret, frame = cap.read()
        if not ret:
            break  # no more frames

        frame_count += 1
        resized_frame = cv2.resize(frame, (resized_width, resized_height))
        gray_frame = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2GRAY)

        if prev_gray is not None:
            # Diff is attributed to the *later* frame of the pair.
            diff_sum = np.sum(cv2.absdiff(prev_gray, gray_frame))
            frame_diffs.append((frame_count, diff_sum))

        prev_gray = gray_frame

    cap.release()

    # Pick the threshold as the target_frames-th largest diff so that roughly
    # target_frames diffs lie strictly above it (fewer when there are ties).
    sorted_diffs = sorted((d for _, d in frame_diffs), reverse=True)
    if len(sorted_diffs) >= target_frames:
        threshold = sorted_diffs[target_frames - 1]
    else:
        threshold = sorted_diffs[-1] if sorted_diffs else 0

    print(f"Calculated threshold for approximately {target_frames} frames: {threshold}")

    # Keep only frames whose diff is strictly above the threshold.
    non_similar_frames = [num for num, diff in frame_diffs if diff > threshold]

    # Guarantee at least one frame so downstream OCR has something to process.
    if not non_similar_frames and total_frames > 0:
        non_similar_frames.append(1)

    if non_similar_frames:
        print(f"Frames not similar (above dynamic threshold of {threshold}): {non_similar_frames}")
    else:
        print(f"All frames are similar. One frame has been included.")

    print(f"Total non-similar frames: {len(non_similar_frames)}")

    return non_similar_frames

# Import necessary libraries
import cv2
from paddleocr import PaddleOCR, draw_ocr
# import paddle
import os
import csv
import numpy as np
import gradio as gr
import google.generativeai as genai
import pandas as pd
# from google.colab import userdata
from datetime import datetime


# Initialize the PaddleOCR engine once at import time (English model,
# with text-angle classification enabled) — reused for every frame.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Configure the Gemini client from the GEMINI_API environment variable.
# NOTE(review): no check that the variable is set — genai.configure will
# receive None if it is missing; verify deployment always provides it.
GOOGLE_API_KEY = os.getenv("GEMINI_API")
genai.configure(api_key=GOOGLE_API_KEY)

# Adjusted branding function to map back to original resolution
def add_branding(frame, text="Annotated Video OCR", position=(50, 50), font_scale=2, font_thickness=3,
                 text_color=(255, 255, 255), bg_color=(0, 0, 0), original_resolution=None):
    """Draw a semi-transparent branding banner onto ``frame`` (modified in place, also returned).

    Parameters:
        frame: BGR image (numpy array) to annotate.
        text (str): Banner text.
        position (tuple): (x, y) top-left corner of the banner.
        font_scale (float): cv2.putText font scale.
        font_thickness (int): cv2.putText stroke thickness.
        text_color (tuple): BGR text color.
        bg_color (tuple): BGR background-rectangle color.
        original_resolution (tuple | None): (width, height); when given, ``position``
            is rescaled from that coordinate space into ``frame``'s space.

    Returns:
        The annotated frame.
    """
    # BUG FIX: previously `x, y` were only assigned inside the
    # `if original_resolution:` branch, so calling with the default
    # original_resolution=None raised UnboundLocalError below.
    x, y = position
    if original_resolution:
        # Map the requested position from the original coordinate space to this frame.
        original_width, original_height = original_resolution
        x = int(x * (original_width / frame.shape[1]))
        y = int(y * (original_height / frame.shape[0]))

    overlay = frame.copy()
    alpha = 0.6  # Transparency factor for the background rectangle

    # Measure the rendered text so the background rectangle fits it.
    (text_width, text_height), _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, font_thickness)
    x_end = x + text_width + 10   # right padding
    y_end = y + text_height + 10  # bottom padding

    # Clamp the banner so it stays fully inside the frame.
    if x_end > frame.shape[1]:  # horizontal overflow
        x = frame.shape[1] - text_width - 10
        x_end = frame.shape[1]
    if y_end > frame.shape[0]:  # vertical overflow
        y = frame.shape[0] - text_height - 10
        y_end = frame.shape[0]

    # Filled background rectangle, blended onto the frame with transparency.
    cv2.rectangle(overlay, (x, y), (x_end, y_end), bg_color, -1)
    cv2.addWeighted(overlay, alpha, frame, 1 - alpha, 0, frame)

    # Finally draw the text on top.
    cv2.putText(frame, text, (x + 5, y + text_height + 5), cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color, font_thickness)

    return frame


# Function to preprocess the frame for OCR
def preprocess_frame(frame, resize_width=600, resize_height=None, grayscale=True):
    """Resize (and optionally grayscale) a frame for OCR.

    Returns the processed frame together with the source resolution as
    (width, height) so detections can be mapped back later.
    """
    # Remember the incoming resolution before any resizing.
    source_height, source_width = frame.shape[:2]
    print("[INFO] Original Height: ", source_height, "[INFO] Original Width: ", source_width)

    # Explicit height wins; otherwise keep the aspect ratio from the width.
    if resize_height is None:
        target_height = int(frame.shape[0] * (resize_width / frame.shape[1]))
    else:
        target_height = resize_height
    processed = cv2.resize(frame, (resize_width, target_height))

    if grayscale:
        processed = cv2.cvtColor(processed, cv2.COLOR_BGR2GRAY)

    return processed, (source_width, source_height)


def parse_gemini_response(response_text):
    """Pull the manufacturing date, expiry date and MRP out of the Gemini reply.

    Date fields are normalized to DD/MM/YYYY where possible; unparseable
    values are passed through unchanged. Missing fields stay "".
    """

    def standardize_date(date_str):
        """Convert date into DD/MM/YYYY format."""
        try:
            if "/" not in date_str:
                return date_str  # not a slash-separated date; pass through
            pieces = date_str.split("/")
            if len(pieces) == 2:
                # MM/YYYY (or three-letter month abbreviation): use day 01.
                month = datetime.strptime(pieces[0], "%b").month if len(pieces[0]) == 3 else int(pieces[0])
                return f"01/{month:02d}/{pieces[1]}"
            if len(pieces) == 3:
                day, month, year = pieces
                return f"{int(day):02d}/{int(month):02d}/{int(year)}"
            return date_str  # unexpected number of components
        except Exception:
            return date_str  # fall back to the raw string on any parse error

    parsed_data = {
        "Manufacturing Date": "",
        "Expiry Date": "",
        "MRP": ""
    }

    # Prefix -> (output key, whether the value is a date to normalize).
    field_specs = (
        ("Manufacturing Date:", "Manufacturing Date", True),
        ("Expiry Date:", "Expiry Date", True),
        ("MRP:", "MRP", False),
    )

    for raw_line in response_text.split("\n"):
        for prefix, key, is_date in field_specs:
            if raw_line.startswith(prefix):
                value = raw_line.split(prefix)[1].strip()
                parsed_data[key] = standardize_date(value) if is_date else value
                break

    return parsed_data



# Function to call Gemini LLM for date predictions
def call_gemini_llm_for_dates(text):
    """Ask the Gemini model to extract mfg/expiry dates and MRP from OCR text.

    Sends a structured extraction prompt containing ``text`` and returns the
    model's reply stripped of surrounding whitespace.
    """
    # One-shot model handle; the prompt pins the exact output format so the
    # reply can be parsed by parse_gemini_response.
    gemini_model = genai.GenerativeModel('models/gemini-1.5-flash')
    prompt = f"""
    You are provided with extracted words from a product's packaging. Based on this text, your task is to predict the manufacturing and expiry dates of the product, and extract the MRP details.

    Please follow these rules:
    - If only one date is present, consider it to be the expiry date.
    - If the dates are detected as only Month and Year, provide them in the format MM/YYYY.
    - Ignore any noise or irrelevant information.
    - Predict the most logical manufacturing and expiry dates based on the context provided.
    - For MRP:
        - Extract the value listed as the MRP, considering symbols like "₹", "Rs.", or "MRP".
        - If no MRP is detected, output "MRP: Not available".
    - Output the details strictly in the format:
        Manufacturing Date: DD/MM/YYYY or MM/YYYY
        Expiry Date: DD/MM/YYYY or MM/YYYY
        MRP: ₹<value> or "Not available"
    - Do not generate any other information or text besides the requested details.

    Here is the extracted text:
    {text}
    """

    # Fire the request and echo the raw reply for debugging.
    reply = gemini_model.generate_content(prompt)
    print(reply.text)

    return reply.text.strip()





def gradio_video_ocr_processing(video_file):
    """Full pipeline for one uploaded video: select key frames, OCR them,
    write an annotated video + CSV, and extract dates/MRP via Gemini.

    Parameters:
        video_file (str): Path to the input video (as supplied by Gradio).

    Returns:
        tuple: (annotated video path, raw Gemini reply, parsed field dict),
        or (None, error message, None) when the video cannot be opened.
    """
    input_video_path = video_file
    output_video_path = "annotated_video.mp4"
    output_text_file = "detected_words.csv"

    print("[DEBUG] Starting video processing.")

    # Step 1: Frame similarity detection.
    print("[DEBUG] Detecting non-similar frames.")
    # frame_similarity_detection returns 1-based frame numbers; use a set
    # so the per-frame membership test below is O(1) instead of O(n).
    non_similar_frames = set(frame_similarity_detection(input_video_path))

    # Step 2: OCR processing and saving the results.
    cap = cv2.VideoCapture(input_video_path)
    if not cap.isOpened():
        print("[ERROR] Cannot open video file.")
        # FIX: return the same arity as the success path (3 values) so a
        # Gradio handler with three declared outputs doesn't break.
        return None, "Error: Cannot open video file.", None

    input_frame_rate = cap.get(cv2.CAP_PROP_FPS)
    print(f"[DEBUG] Input video frame rate: {input_frame_rate} FPS.")

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = None  # created lazily once the first output frame size is known

    detected_words = [["Frame", "Word", "Confidence", "X", "Y", "Width", "Height"]]
    frame_count = 0
    resize_width = 600
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("[DEBUG] End of video stream.")
            break

        # BUG FIX: frame_similarity_detection numbers frames starting at 1,
        # but the old code tested membership with the pre-increment (0-based)
        # counter, shifting every processed frame by one. Increment first so
        # both sides use the same 1-based numbering.
        frame_count += 1
        if frame_count not in non_similar_frames:
            continue  # skip similar frames

        # Downscale (and grayscale) for faster OCR; keep the original size for output.
        resized_frame, original_resolution = preprocess_frame(frame, resize_width)

        print(f"[DEBUG] Processing frame {frame_count}.")

        # OCR with PaddleOCR; results[0] is None when nothing was detected.
        results = ocr.ocr(resized_frame)
        if results[0] is not None:
            for line in results[0]:
                word, confidence = line[1][0], float(line[1][1])
                if confidence > 0.7:  # keep only confident detections
                    bbox = line[0]

                    # Bounding box corners in the resized frame.
                    x_min_resized, y_min_resized = int(bbox[0][0]), int(bbox[0][1])
                    x_max_resized, y_max_resized = int(bbox[2][0]), int(bbox[2][1])

                    original_width, original_height = original_resolution
                    # preprocess_frame kept the aspect ratio, so the resized
                    # height is derivable from the width scale.
                    resized_height = (original_height / original_width) * resize_width
                    # Rescale the bounding box back to the original resolution.
                    x_min = int(x_min_resized * (original_width / resize_width))
                    y_min = int(y_min_resized * (original_height / resized_height))
                    x_max = int(x_max_resized * (original_width / resize_width))
                    y_max = int(y_max_resized * (original_height / resized_height))

                    detected_words.append([frame_count, word, confidence, x_min, y_min, x_max - x_min, y_max - y_min])

                    # Annotate the detection on the full-resolution frame.
                    frame = cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                    frame = cv2.putText(frame, f"{word} ({confidence:.2f})", (x_min, y_min - 10),
                                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
        else:
            print(f"[DEBUG] No text detected in frame {frame_count}.")

        # Branding banner, positioned using the original resolution.
        frame = add_branding(frame, original_resolution=original_resolution)

        if out is None:
            out = cv2.VideoWriter(output_video_path, fourcc, input_frame_rate,
                                  (frame.shape[1], frame.shape[0]))
        out.write(frame)

    cap.release()
    if out is not None:
        out.release()
    cv2.destroyAllWindows()

    # Save detected words to CSV.
    with open(output_text_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerows(detected_words)
    print(f"[INFO] Detected words saved to {output_text_file}.")
    print(f"[INFO] Annotated video saved to {output_video_path}.")

    # Step 3: feed the deduplicated word list to Gemini and parse its reply.
    ocr_results_df = pd.read_csv(output_text_file)
    ocr_results_df_clean = ocr_results_df.drop_duplicates(subset='Word', keep='first')

    detected_text = " ".join(ocr_results_df_clean['Word'].dropna())
    gemini_response = call_gemini_llm_for_dates(detected_text)
    parsed_output = parse_gemini_response(gemini_response)

    print("[DEBUG] Gemini response generated.")
    return output_video_path, gemini_response, parsed_output