Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| from transformers import pipeline | |
| import numpy as np | |
| import pytesseract | |
| import cv2 | |
| from PIL import Image | |
| from evaluate import load | |
| import librosa | |
| from transformers.models.whisper.english_normalizer import BasicTextNormalizer | |
# Shared ASR pipeline (Whisper base); downloads the model on first run.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")
# Word-error-rate metric, used by match() to score transcriptions.
wer = load("wer")
def extract_text(image):
    """
    Extract text boxes from an image using Tesseract OCR.

    Args:
        image (PIL.Image.Image): Input image.

    Returns:
        dict: Mapping of sequential int keys (0..n-1) to dicts with
            "coordinates" (x, y, w, h), "text" and "conf" for every box
            that passed the confidence filter.
    """
    result = pytesseract.image_to_data(image, output_type='dict')
    n_boxes = len(result['level'])
    data = {}
    k = 0
    for i in range(n_boxes):
        conf = result['conf'][i]
        text = result['text'][i]
        # Skip empty detections and Tesseract's -1 "no text here" entries.
        if conf >= 0.3 and text != '' and conf != -1:
            x, y, w, h = (result['left'][i], result['top'][i],
                          result['width'][i], result['height'][i])
            # BUG FIX: the original read result['text'][k] / ['conf'][k];
            # once any box was filtered out, k drifted behind i and text was
            # attached to the wrong coordinates. Index with i throughout.
            data[k] = {"coordinates": (x, y, w, h),
                       "text": text,
                       "conf": conf}
            k += 1
    return data
def draw_rectangle(image, x, y, w, h, color=(0, 0, 255), thickness=2):
    """
    Return a copy of *image* with a rectangle outline drawn on it.

    Args:
        image (PIL.Image.Image or numpy.ndarray): Input RGB image.
        x (int): x-coordinate of the rectangle's top-left corner.
        y (int): y-coordinate of the rectangle's top-left corner.
        w (int): Width of the rectangle.
        h (int): Height of the rectangle.
        color (tuple, optional): Rectangle colour in BGR order — kept for
            backward compatibility with the original OpenCV-style call; the
            default (0, 0, 255) therefore draws red. (The original
            docstring claimed RGB, which was incorrect.)
        thickness (int, optional): Outline thickness in pixels.

    Returns:
        PIL.Image.Image: New image with the rectangle drawn on it.
    """
    image_array = np.array(image)
    # The original converted RGB->BGR, drew, then converted back. Drawing
    # directly on the RGB array with the colour tuple reversed is
    # pixel-identical and skips both conversions.
    cv2.rectangle(image_array, (x, y), (x + w, y + h),
                  tuple(color[::-1]), thickness)
    return Image.fromarray(image_array)
def transcribe(audio):
    """
    Transcribe audio into text using the module-level Whisper ASR pipeline.

    Args:
        audio (str or tuple): Either a file path, or a
            (sampling_rate, raw_audio) tuple as supplied by gradio.

    Returns:
        str: Transcribed text.

    Raises:
        ValueError: If audio is neither a file path nor a 2-tuple.
    """
    if isinstance(audio, str):  # audio is a file path
        y, sr = librosa.load(audio)
    elif isinstance(audio, tuple) and len(audio) == 2:
        # audio is (sampling_rate, raw_audio)
        sr, y = audio
        y = np.asarray(y).astype(np.float32)
    else:
        raise ValueError(
            "Invalid input. Audio should be a file path or a tuple of (sampling_rate, raw_audio).")
    # BUG FIX: the original divided by the peak unconditionally, which
    # produces NaNs for silent (all-zero) audio and crashes on empty input.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y = y / peak
    transcribed_text = asr({"sampling_rate": sr, "raw": y})["text"]
    return transcribed_text
def clean_transcription(transcription):
    """
    Collapse consecutive duplicate words in a transcription.

    The text is lower-cased first, so "Hello hello" collapses to "hello".

    Args:
        transcription (str): Input transcription.

    Returns:
        str: Lower-cased transcription with consecutive duplicate words
        removed; empty string when the input contains no words.

    Raises:
        ValueError: If transcription is not a string.
    """
    if not isinstance(transcription, str):
        raise ValueError("Invalid input. Transcription should be a string.")
    words = transcription.lower().split()
    # BUG FIX: the original indexed words[0] unconditionally, raising
    # IndexError for empty or whitespace-only input.
    if not words:
        return ""
    cleaned_words = [words[0]]
    for word in words[1:]:
        if word != cleaned_words[-1]:
            cleaned_words.append(word)
    return ' '.join(cleaned_words)
def match(refence, spoken):
    """
    Score how well *spoken* matches *refence*, as 1 - word error rate.

    Args:
        refence (str): Reference string.
        spoken (str): Spoken (hypothesis) string.

    Returns:
        float: Match score; 1 is a perfect match, lower is worse
        (0 for an empty spoken string).

    Raises:
        ValueError: If either argument is not a string.
    """
    if not (isinstance(refence, str) and isinstance(spoken, str)):
        raise ValueError(
            "Invalid input. Reference and spoken should be strings.")
    if spoken == "":
        return 0
    normalizer = BasicTextNormalizer()
    # De-duplicate repeated words before normalizing the hypothesis.
    hypothesis = normalizer(clean_transcription(spoken))
    reference = normalizer(refence)
    error_rate = wer.compute(references=[reference],
                             predictions=[hypothesis])
    return 1 - error_rate
def split_to_l(text, answer):
    """
    Split *text* into word chunks whose size is the word count of *answer*.

    Args:
        text (str): The input text to be split.
        answer (str): Determines the chunk size via its word count.

    Returns:
        tuple: (chunks, start indices of each chunk in the word list,
        chunk size in words).

    Raises:
        ValueError: If text or answer is not a string.
    """
    if not (isinstance(text, str) and isinstance(answer, str)):
        raise ValueError("Invalid input. Text and answer should be strings.")
    size = len(answer.split(" "))
    words = text.split(" ")
    starts = list(range(0, len(words), size))
    pieces = [" ".join(words[start: start + size]) for start in starts]
    return pieces, starts, size
def reindex_data(data, index, l):
    """
    Extract l consecutive entries from *data* starting at *index*,
    rekeyed 0..l-1.

    Args:
        data (dict): Original dictionary with integer keys.
        index (int): Starting key in the original dictionary.
        l (int): Number of entries to take.

    Returns:
        dict: New dictionary keyed 0..l-1.

    Raises:
        ValueError: If data is not a dict or index/l are not ints.
    """
    valid = (isinstance(data, dict)
             and isinstance(index, int)
             and isinstance(l, int))
    if not valid:
        raise ValueError(
            "Invalid input. Data should be a dictionary, index and l should be integers.")
    return {offset: data[index + offset] for offset in range(l)}
def process_image(im, data):
    """
    Black out everything outside the overall text region and outline the
    individual text boxes.

    Args:
        im (PIL.Image.Image or numpy.ndarray): Input image.
        data (dict): Box data as produced by extract_text(), keyed 0..n-1,
            each entry carrying "coordinates" = (x, y, w, h).

    Returns:
        PIL.Image.Image or numpy.ndarray: Image where only the text region
        is kept and boxes are outlined (ndarray when no box is outlined).
    """
    im_array = np.array(im)
    hg, wg, _ = im_array.shape
    coords = [data[i]["coordinates"] for i in range(len(data))]
    xs = [c[0] for c in coords]
    ys = [c[1] for c in coords]
    text_start_x, text_x = min(xs), max(xs)
    text_start_y, text_y = min(ys), max(ys)
    # Mean box size is used to pad the region past the last box origin.
    max_width = int(np.mean([c[2] for c in coords]))
    max_height = int(np.mean([c[3] for c in coords]))
    # Copy just the text region from the source onto a black canvas.
    wall = np.zeros((hg, wg, 3), np.uint8)
    y_stop = text_y + max_height
    x_stop = text_x + max_width
    wall[text_start_y:y_stop, text_start_x:x_stop] = \
        im_array[text_start_y:y_stop, text_start_x:x_stop, :]
    # NOTE(review): the loop starts at 1, so box 0 is never outlined —
    # preserved from the original; confirm whether this is intentional.
    for i in range(1, len(data)):
        x, y, w, h = data[i]["coordinates"]
        wall = draw_rectangle(wall, x, y, w, h)
    return wall
def run(stream, image):
    """
    Transcribe *stream*, locate the matching text chunk in *image* via OCR,
    and return the image with the matched region highlighted.

    Args:
        stream (str or tuple): Audio source — a file path or a
            (sampling_rate, raw_audio) tuple.
        image (PIL.Image.Image or str): Input image, or a path to one.

    Returns:
        numpy.ndarray or PIL.Image.Image: Highlighted image when some chunk
        matches the transcription, otherwise the unmodified image as an
        array.

    Raises:
        ValueError: If stream is neither a file path nor a tuple.
    """
    if not isinstance(stream, (str, tuple)):
        raise ValueError(
            "Invalid input. Stream should be either a file path or a tuple of (sampling_rate, raw_audio).")
    # BUG FIX: the original always called Image.open(image), which raises
    # TypeError when `image` is already a PIL Image (the type the rest of
    # this file expects). Open it only when given a path.
    pil_image = image if isinstance(image, Image.Image) else Image.open(image)
    data = extract_text(pil_image)
    im_text = " ".join(data[i]["text"] for i in range(len(data)))
    trns_text = transcribe(stream)
    chunks, indices, l = split_to_l(im_text, trns_text)
    im_array = np.array(pil_image)
    data2 = None
    # Take the first chunk with at least a weak (>= 0.10) WER-based match.
    for i, chunk in enumerate(chunks):
        if match(chunk, trns_text) >= 0.10:
            data2 = reindex_data(data, indices[i], l)
            break
    if data2 is not None:
        return process_image(im_array, data2)
    return im_array