Abhinav-hf committed on
Commit
f8ed977
·
verified ·
1 Parent(s): b23c835

Update Backend/OCR/Dynamic/VideoOCR.py

Browse files
Files changed (1) hide show
  1. Backend/OCR/Dynamic/VideoOCR.py +271 -168
Backend/OCR/Dynamic/VideoOCR.py CHANGED
@@ -1,168 +1,271 @@
1
- # Import necessary libraries
2
- import cv2
3
- from paddleocr import PaddleOCR, draw_ocr
4
-
5
- import os
6
- import csv
7
- import numpy as np
8
- import gradio as gr
9
- import google.generativeai as genai
10
- import pandas as pd
11
-
12
-
13
- # Define paths
14
- ocr = PaddleOCR(use_angle_cls=True, lang='en')
15
- GOOGLE_API_KEY = os.getenv("GEMINI_API")
16
- genai.configure(api_key=GOOGLE_API_KEY)
17
-
18
-
19
- # Function to add branding to a frame
20
- def add_branding(frame, text="Abhinav Video OCR", position=(50, 50), font_scale=2, font_thickness=3,
21
- text_color=(255, 255, 255), bg_color=(0, 0, 0)):
22
- overlay = frame.copy()
23
- alpha = 0.6 # Transparency factor
24
-
25
- # Get the width and height of the text box
26
- (text_width, text_height), _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, font_thickness)
27
- x, y = position
28
-
29
- # Draw a rectangle and put the text on it
30
- cv2.rectangle(overlay, (x, y + 10), (x + text_width, y - text_height - 10), bg_color, -1)
31
- cv2.addWeighted(overlay, alpha, frame, 1 - alpha, 0, frame)
32
- cv2.putText(frame, text, position, cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color, font_thickness)
33
-
34
- return frame
35
-
36
- # Function to preprocess the frame for OCR
37
- def preprocess_frame(frame, resize_width=600):
38
- resized = cv2.resize(frame, (resize_width, int(frame.shape[0] * (resize_width / frame.shape[1]))))
39
- gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
40
- return gray, resized
41
- def parse_gemini_response(response_text):
42
- parsed_data = {
43
- "Manufacturing Date": "",
44
- "Expiry Date": "",
45
- "MRP Details": ""
46
- }
47
- for line in response_text.split("\n"):
48
- if line.startswith("Manufacturing Date:"):
49
- parsed_data["Manufacturing Date"] = line.split("Manufacturing Date:")[1].strip()
50
- elif line.startswith("Expiry Date:"):
51
- parsed_data["Expiry Date"] = line.split("Expiry Date:")[1].strip()
52
- elif line.startswith("MRP Details:"):
53
- parsed_data["MRP Details"] = line.split("MRP Details:")[1].strip()
54
- return parsed_data
55
-
56
- # Function to call Gemini LLM for date predictions
57
- def call_gemini_llm_for_dates(text):
58
- # Use the previously set up Gemini model for predictions
59
- model = genai.GenerativeModel('models/gemini-1.5-flash')
60
- prompt = f"""
61
- You are provided with extracted words from a product's packaging. Based on this text, your task is to predict the manufacturing and expiry dates of the product.
62
-
63
- Please follow these rules:
64
- - If only one date is present, consider it to be the expiry date.
65
- - Ignore any noise or irrelevant information.
66
- - Predict the most logical manufacturing and expiry dates based on the context provided.
67
- - Output the dates strictly in the format:
68
- Manufacturing Date: DD/MM/YYYY
69
- Expiry Date: DD/MM/YYYY
70
- - Do not generate any other information or text besides the two dates.
71
-
72
- Here is the extracted text:
73
- {text}
74
- """
75
-
76
- # Send the prompt to Gemini model and get the response
77
- response = model.generate_content(prompt)
78
- print(response.text)
79
-
80
- return response.text.strip()
81
-
82
-
83
-
84
- # Gradio function to process the video
85
- def gradio_video_ocr_processing(video_file):
86
- input_video_path = video_file
87
- output_video_path = "annotated_video.mp4"
88
- output_text_file = "detected_words.csv"
89
-
90
- print("[DEBUG] Starting video processing.")
91
- cap = cv2.VideoCapture(input_video_path)
92
- if not cap.isOpened():
93
- print("[ERROR] Cannot open video file.")
94
- return None, "Error: Cannot open video file."
95
-
96
- input_frame_rate = cap.get(cv2.CAP_PROP_FPS)
97
- print(f"[DEBUG] Input video frame rate: {input_frame_rate} FPS.")
98
-
99
- fourcc = cv2.VideoWriter_fourcc(*'mp4v')
100
- out = None
101
- frame_skip = 2
102
- resize_width = 600
103
- detected_words = [["Frame", "Word", "Confidence", "X", "Y", "Width", "Height"]]
104
- frame_count = 0
105
-
106
- while cap.isOpened():
107
- ret, frame = cap.read()
108
- if not ret:
109
- print("[DEBUG] End of video stream.")
110
- break
111
-
112
- if frame_count % frame_skip != 0:
113
- frame_count += 1
114
- continue
115
-
116
- # Preprocess frame
117
- gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
118
- resized_frame = cv2.resize(gray, (resize_width, int(frame.shape[0] * resize_width / frame.shape[1])))
119
- print(f"[DEBUG] Processing frame {frame_count}.")
120
-
121
- # OCR processing with PaddleOCR
122
- # OCR processing with PaddleOCR
123
- results = ocr.ocr(resized_frame)
124
- if results[0] is not None:
125
- for line in results[0]:
126
- word, confidence = line[1][0], float(line[1][1])
127
- if confidence > 0.7:
128
- bbox = line[0]
129
- x_min, y_min = int(bbox[0][0]), int(bbox[0][1])
130
- x_max, y_max = int(bbox[2][0]), int(bbox[2][1])
131
- detected_words.append([frame_count, word, confidence, x_min, y_min, x_max - x_min, y_max - y_min])
132
-
133
- # Annotate the frame
134
- frame = cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
135
- frame = cv2.putText(frame, f"{word} ({confidence:.2f})", (x_min, y_min - 10),
136
- cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
137
- else:
138
- print(f"[DEBUG] No text detected in frame {frame_count}.")
139
-
140
- frame = add_branding(frame)
141
- if out is None:
142
- out = cv2.VideoWriter(output_video_path, fourcc, input_frame_rate // frame_skip,
143
- (frame.shape[1], frame.shape[0]))
144
- out.write(frame)
145
- frame_count += 1
146
-
147
- cap.release()
148
- if out is not None:
149
- out.release()
150
- cv2.destroyAllWindows()
151
-
152
- # Save detected words to CSV
153
- with open(output_text_file, 'w', newline='', encoding='utf-8') as file:
154
- writer = csv.writer(file)
155
- writer.writerows(detected_words)
156
- print(f"[INFO] Detected words saved to {output_text_file}.")
157
- print(f"[INFO] Annotated video saved to {output_video_path}.")
158
-
159
- # Generate Gemini response
160
- ocr_results_df = pd.read_csv(output_text_file)
161
- detected_text = " ".join(ocr_results_df['Word'].dropna())
162
- gemini_response = call_gemini_llm_for_dates(detected_text)
163
- parsed_output = parse_gemini_response(gemini_response)
164
-
165
- print("[DEBUG] Gemini response generated.")
166
- return output_video_path, gemini_response, parsed_output
167
-
168
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+
4
# Select visually distinct frames from a video by inter-frame difference.
def frame_similarity_detection(video_path, threshold=350*1E4, scale_factor=0.45, output_video_path="non_similar_frames_output.mp4"):
    """Collect frames that differ noticeably from their predecessor.

    Consecutive frames are downscaled to 640x480, converted to grayscale, and
    compared with an absolute pixel difference; a frame whose summed difference
    against the previous frame exceeds *threshold* is treated as non-similar.
    Selected frames are also written to *output_video_path*; if every frame is
    similar, the first frame alone is written so the output is never empty.

    Parameters
    ----------
    video_path : str — input video file path.
    threshold : float — sum-of-absolute-differences cutoff, measured on the
        640x480 grayscale frames.
    scale_factor : float — unused; kept only for backward compatibility with
        existing callers.
    output_video_path : str — destination for the filtered video.

    Returns
    -------
    (non_similar_frames, output_video_path), where non_similar_frames is the
    list of 1-based frame numbers that exceeded the threshold.
    """
    # Open the video file
    cap = cv2.VideoCapture(video_path)

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print(f"Total number of frames in the video: {total_frames}")

    prev_frame = None
    frame_count = 0
    non_similar_frames = []

    # All written frames share one fixed reduced resolution, so the writer can
    # be opened up front.
    resized_width = 640
    resized_height = 480
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for mp4 format
    out = cv2.VideoWriter(output_video_path, fourcc, cap.get(cv2.CAP_PROP_FPS), (resized_width, resized_height))

    # Kept so a single frame can be emitted when every frame is similar.
    first_frame = None

    while True:
        ret, frame = cap.read()
        if not ret:
            break  # No more frames are available

        frame_count += 1  # 1-based frame numbering

        # Resize the frame to reduce comparison cost.
        resized_frame = cv2.resize(frame, (resized_width, resized_height))
        if frame_count == 1:
            first_frame = resized_frame

        # Grayscale makes the pixel-difference comparison cheaper.
        gray_frame = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2GRAY)

        if prev_frame is not None:
            # Total absolute difference between this frame and the previous one.
            frame_diff = cv2.absdiff(prev_frame, gray_frame)
            diff_sum = np.sum(frame_diff)

            if diff_sum > threshold:
                non_similar_frames.append(frame_count)
                # Write selected frames immediately instead of buffering them
                # all in a list (the original kept every selected frame in
                # memory until the end).
                out.write(resized_frame)

        prev_frame = gray_frame

    # If nothing passed the threshold, include the first frame so the output
    # video is not empty.
    if not non_similar_frames and first_frame is not None:
        out.write(first_frame)

    # Release the capture/writer and any OpenCV windows.
    cap.release()
    out.release()
    cv2.destroyAllWindows()

    if non_similar_frames:
        print(f"Frames not similar (above difference threshold of {threshold}): {non_similar_frames}")
        print(f"Output video saved as: {output_video_path}")
    else:
        print(f"All frames are similar (below difference threshold of {threshold}). One frame has been included.")

    print(f"Total non-similar frames: {len(non_similar_frames)}")

    return non_similar_frames, output_video_path
87
# Import necessary libraries
import cv2
from paddleocr import PaddleOCR, draw_ocr
import os
import csv
import numpy as np
import gradio as gr
import google.generativeai as genai
import pandas as pd

# Detect GPU availability for PaddleOCR.
# BUG FIX: `paddle` was referenced here while its import was commented out,
# which raised a NameError as soon as the module was loaded. paddle ships as
# paddleocr's backend, but fall back to CPU if importing or probing it fails.
try:
    import paddle
    use_gpu = paddle.device.is_compiled_with_cuda() and paddle.device.get_device().startswith("gpu")
except Exception:
    use_gpu = False

# Shared OCR engine (angle classification enabled, English models).
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=use_gpu)

# Gemini API key is read from the environment. genai.configure accepts None,
# but generate_content calls will fail later if GEMINI_API is unset.
GOOGLE_API_KEY = os.getenv("GEMINI_API")
genai.configure(api_key=GOOGLE_API_KEY)
108
+
109
# Function to add branding to a frame
def add_branding(frame, text="Abhinav Video OCR", position=(50, 50), font_scale=2, font_thickness=3,
                 text_color=(255, 255, 255), bg_color=(0, 0, 0)):
    """Stamp a semi-transparent branded label onto *frame*.

    The frame is modified in place (and also returned): a background box is
    blended in at 60% opacity, then the label text is drawn fully opaque.
    """
    banner = frame.copy()
    opacity = 0.6  # weight of the solid banner when blended back in

    # Rendered label size, used to size the background box.
    (label_w, label_h), _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, font_thickness)
    left, baseline = position

    # Solid box on the copy, blended into the original for the transparency
    # effect, then the label drawn on top.
    cv2.rectangle(banner, (left, baseline + 10), (left + label_w, baseline - label_h - 10), bg_color, -1)
    cv2.addWeighted(banner, opacity, frame, 1 - opacity, 0, frame)
    cv2.putText(frame, text, position, cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color, font_thickness)

    return frame
125
+
126
+ # Function to preprocess the frame for OCR
127
+ def preprocess_frame(frame, resize_width=600):
128
+ resized = cv2.resize(frame, (resize_width, int(frame.shape[0] * (resize_width / frame.shape[1]))))
129
+ gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
130
+ return gray, resized
131
def parse_gemini_response(response_text):
    """Extract the labelled fields from Gemini's plain-text reply.

    Scans each line of *response_text* for the prefixes
    "Manufacturing Date:", "Expiry Date:" and "MRP Details:" and returns a
    dict with those three keys; fields that never appear stay "".
    """
    field_names = ("Manufacturing Date", "Expiry Date", "MRP Details")
    parsed_data = {name: "" for name in field_names}

    for raw_line in response_text.split("\n"):
        for name in field_names:
            prefix = name + ":"
            if raw_line.startswith(prefix):
                # Keep only the text between the first and (any) second
                # occurrence of the prefix, matching split(...)[1] semantics.
                parsed_data[name] = raw_line.split(prefix)[1].strip()
                break

    return parsed_data
145
+
146
# Function to call Gemini LLM for date predictions
def call_gemini_llm_for_dates(text):
    """Send OCR-extracted packaging text to Gemini and return its raw reply.

    The prompt instructs the model to answer with only
    "Manufacturing Date: ..." / "Expiry Date: ..." lines so the reply can be
    parsed by parse_gemini_response. Requires genai.configure() to have been
    called with a valid key; errors from generate_content propagate.

    :param text: whitespace-joined words extracted by OCR.
    :return: response.text with surrounding whitespace stripped.
    """
    # Use the previously set up Gemini model for predictions
    model = genai.GenerativeModel('models/gemini-1.5-flash')
    prompt = f"""
    You are provided with extracted words from a product's packaging. Based on this text, your task is to predict the manufacturing and expiry dates of the product.

    Please follow these rules:
    - If only one date is present, consider it to be the expiry date.
    - If the dates are detected as only Month and Year, provide them in the format MM/YYYY.
    - Ignore any noise or irrelevant information.
    - Predict the most logical manufacturing and expiry dates based on the context provided.
    - Output the dates strictly in the format:
    Manufacturing Date: DD/MM/YYYY or MM/YYYY
    Expiry Date: DD/MM/YYYY or MM/YYYY
    - Do not generate any other information or text besides the two dates.

    Here is the extracted text:
    {text}
    """

    # Send the prompt to Gemini model and get the response
    response = model.generate_content(prompt)
    print(response.text)  # echoed for debugging only

    return response.text.strip()
173
+
174
+
175
+
176
# Gradio entry point: similarity filtering + OCR + Gemini date extraction.
def gradio_video_ocr_processing(video_file):
    """Process an uploaded video and extract product dates/MRP via OCR + Gemini.

    Steps:
      1. Select visually distinct frames with frame_similarity_detection.
      2. Run PaddleOCR on those frames only; annotate them into an output
         video and log the detected words to CSV.
      3. Feed the de-duplicated words to Gemini and parse its reply.

    :param video_file: path to the uploaded video.
    :return: (annotated_video_path, gemini_response_text, parsed_fields);
             on failure to open the video, (None, error_message, None).
    """
    input_video_path = video_file
    output_video_path = "annotated_video.mp4"
    output_text_file = "detected_words.csv"

    print("[DEBUG] Starting video processing.")

    # Step 1: Frame similarity detection (frame numbers are 1-based).
    print("[DEBUG] Detecting non-similar frames.")
    non_similar_frames, frame_diff_video_path = frame_similarity_detection(input_video_path)
    # O(1) membership tests in the per-frame loop below (the original probed a
    # list, which is O(n) per frame).
    frames_to_process = set(non_similar_frames)

    # Step 2: OCR processing and saving the results.
    cap = cv2.VideoCapture(input_video_path)
    if not cap.isOpened():
        print("[ERROR] Cannot open video file.")
        # BUG FIX: the error path returned a 2-tuple while the success path
        # returns a 3-tuple; keep the arity consistent for callers.
        return None, "Error: Cannot open video file.", None

    input_frame_rate = cap.get(cv2.CAP_PROP_FPS)
    print(f"[DEBUG] Input video frame rate: {input_frame_rate} FPS.")

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = None

    resize_width = 600
    detected_words = [["Frame", "Word", "Confidence", "X", "Y", "Width", "Height"]]
    frame_count = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("[DEBUG] End of video stream.")
            break

        # BUG FIX: count frames from 1 to match frame_similarity_detection's
        # numbering (it increments before use). The original counted from 0
        # here, so every membership test was off by one and the frame *after*
        # each detected change was processed instead.
        frame_count += 1

        # Only process non-similar frames
        if frame_count not in frames_to_process:
            continue  # Skip similar frames

        # Preprocess: grayscale + aspect-preserving resize for faster OCR.
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        resized_frame = cv2.resize(gray, (resize_width, int(frame.shape[0] * resize_width / frame.shape[1])))

        print(f"[DEBUG] Processing frame {frame_count}.")

        # OCR processing with PaddleOCR
        results = ocr.ocr(resized_frame)
        if results[0] is not None:
            for line in results[0]:
                word, confidence = line[1][0], float(line[1][1])
                if confidence > 0.7:  # keep only confident detections
                    bbox = line[0]
                    x_min, y_min = int(bbox[0][0]), int(bbox[0][1])
                    x_max, y_max = int(bbox[2][0]), int(bbox[2][1])
                    detected_words.append([frame_count, word, confidence, x_min, y_min, x_max - x_min, y_max - y_min])

                    # Annotate the frame.
                    # NOTE(review): bbox coords come from the resized frame but
                    # are drawn on the full-size frame — confirm/rescale if the
                    # boxes look misplaced on large videos.
                    frame = cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                    frame = cv2.putText(frame, f"{word} ({confidence:.2f})", (x_min, y_min - 10),
                                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
        else:
            print(f"[DEBUG] No text detected in frame {frame_count}.")

        frame = add_branding(frame)
        # Lazily open the writer once the first frame's size is known.
        if out is None:
            out = cv2.VideoWriter(output_video_path, fourcc, input_frame_rate,
                                  (frame.shape[1], frame.shape[0]))
        out.write(frame)

    cap.release()
    if out is not None:
        out.release()
    cv2.destroyAllWindows()

    # Save detected words to CSV
    with open(output_text_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerows(detected_words)
    print(f"[INFO] Detected words saved to {output_text_file}.")
    print(f"[INFO] Annotated video saved to {output_video_path}.")

    # Generate Gemini response from the de-duplicated word list.
    ocr_results_df = pd.read_csv(output_text_file)
    ocr_results_df_clean = ocr_results_df.drop_duplicates(subset='Word', keep='first')

    # BUG FIX: astype(str) — pandas may infer a numeric dtype for the Word
    # column (e.g. all-digit dates), and " ".join would then raise TypeError.
    detected_text = " ".join(ocr_results_df_clean['Word'].dropna().astype(str))
    gemini_response = call_gemini_llm_for_dates(detected_text)
    parsed_output = parse_gemini_response(gemini_response)

    print("[DEBUG] Gemini response generated.")
    return output_video_path, gemini_response, parsed_output
269
+
270
+
271
+