Spaces:
Runtime error
Runtime error
fix_doc
Browse files
- __pycache__/recite_module.cpython-310.pyc +0 -0
- app.py +1 -1
- recite_module.py +11 -14
__pycache__/recite_module.cpython-310.pyc
ADDED
|
Binary file (8.75 kB). View file
|
|
|
app.py
CHANGED
|
@@ -13,7 +13,7 @@ demo2 = gr.Interface(
|
|
| 13 |
run,
|
| 14 |
[gr.Audio(sources=["upload"]), gr.Image(
|
| 15 |
type="filepath", label="Image")],
|
| 16 |
-
gr.Image(type="pil", label="output Image")
|
| 17 |
)
|
| 18 |
with demo:
|
| 19 |
gr.TabbedInterface([demo1, demo2],
|
|
|
|
| 13 |
run,
|
| 14 |
[gr.Audio(sources=["upload"]), gr.Image(
|
| 15 |
type="filepath", label="Image")],
|
| 16 |
+
[gr.Image(type="pil", label="output Image")]
|
| 17 |
)
|
| 18 |
with demo:
|
| 19 |
gr.TabbedInterface([demo1, demo2],
|
recite_module.py
CHANGED
|
@@ -6,6 +6,8 @@ import cv2
|
|
| 6 |
from PIL import Image
|
| 7 |
from evaluate import load
|
| 8 |
import librosa
|
|
|
|
|
|
|
| 9 |
|
| 10 |
asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")
|
| 11 |
wer = load("wer")
|
|
@@ -21,8 +23,6 @@ def extract_text(image):
|
|
| 21 |
Raises:
|
| 22 |
ValueError: If the input image is not a PIL Image object.
|
| 23 |
"""
|
| 24 |
-
if not isinstance(image, Image.Image):
|
| 25 |
-
raise ValueError("Invalid input. Image should be a PIL Image object.")
|
| 26 |
|
| 27 |
result = pytesseract.image_to_data(image, output_type='dict')
|
| 28 |
n_boxes = len(result['level'])
|
|
@@ -57,8 +57,6 @@ def draw_rectangle(image, x, y, w, h, color=(0, 0, 255), thickness=2):
|
|
| 57 |
Raises:
|
| 58 |
ValueError: If the input image is not a PIL Image object.
|
| 59 |
"""
|
| 60 |
-
if not isinstance(image, Image.Image):
|
| 61 |
-
raise ValueError("Invalid input. Image should be a PIL Image object.")
|
| 62 |
|
| 63 |
image_array = np.array(image)
|
| 64 |
image_array = cv2.cvtColor(image_array, cv2.COLOR_RGB2BGR)
|
|
@@ -93,7 +91,7 @@ def transcribe(audio):
|
|
| 93 |
y /= np.max(np.abs(y))
|
| 94 |
|
| 95 |
transcribed_text = asr(
|
| 96 |
-
{"sampling_rate": sr, "raw": y}
|
| 97 |
|
| 98 |
return transcribed_text
|
| 99 |
|
|
@@ -137,7 +135,11 @@ def match(refence, spoken):
|
|
| 137 |
|
| 138 |
if spoken == "":
|
| 139 |
return 0
|
| 140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
score = 1 - wer_score
|
| 142 |
return score
|
| 143 |
|
|
@@ -199,9 +201,6 @@ def process_image(im, data):
|
|
| 199 |
Raises:
|
| 200 |
ValueError: If the input image is not a PIL Image object or if the data is not a dictionary.
|
| 201 |
"""
|
| 202 |
-
if not isinstance(im, Image.Image) or not isinstance(data, dict):
|
| 203 |
-
raise ValueError(
|
| 204 |
-
"Invalid input. Image should be a PIL Image object and data should be a dictionary.")
|
| 205 |
|
| 206 |
im_array = np.array(im)
|
| 207 |
hg, wg, _ = im_array.shape
|
|
@@ -244,9 +243,6 @@ def run(stream, image):
|
|
| 244 |
raise ValueError(
|
| 245 |
"Invalid input. Stream should be either a file path or a tuple of (sampling_rate, raw_audio).")
|
| 246 |
|
| 247 |
-
if not isinstance(image, Image.Image):
|
| 248 |
-
raise ValueError("Invalid input. Image should be a PIL Image object.")
|
| 249 |
-
|
| 250 |
data = extract_text(image)
|
| 251 |
im_text_ = [data[i]["text"] for i in range(len(data))]
|
| 252 |
im_text = " ".join(im_text_)
|
|
@@ -255,10 +251,11 @@ def run(stream, image):
|
|
| 255 |
im_array = np.array(Image.open(image))
|
| 256 |
data2 = None
|
| 257 |
for i in range(len(chunks)):
|
| 258 |
-
|
|
|
|
| 259 |
data2 = reindex_data(data, index[i], l)
|
| 260 |
break
|
| 261 |
if data2 is not None:
|
| 262 |
return process_image(im_array, data2)
|
| 263 |
else:
|
| 264 |
-
return im_array
|
|
|
|
| 6 |
from PIL import Image
|
| 7 |
from evaluate import load
|
| 8 |
import librosa
|
| 9 |
+
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
|
| 10 |
+
|
| 11 |
|
| 12 |
asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")
|
| 13 |
wer = load("wer")
|
|
|
|
| 23 |
Raises:
|
| 24 |
ValueError: If the input image is not a PIL Image object.
|
| 25 |
"""
|
|
|
|
|
|
|
| 26 |
|
| 27 |
result = pytesseract.image_to_data(image, output_type='dict')
|
| 28 |
n_boxes = len(result['level'])
|
|
|
|
| 57 |
Raises:
|
| 58 |
ValueError: If the input image is not a PIL Image object.
|
| 59 |
"""
|
|
|
|
|
|
|
| 60 |
|
| 61 |
image_array = np.array(image)
|
| 62 |
image_array = cv2.cvtColor(image_array, cv2.COLOR_RGB2BGR)
|
|
|
|
| 91 |
y /= np.max(np.abs(y))
|
| 92 |
|
| 93 |
transcribed_text = asr(
|
| 94 |
+
{"sampling_rate": sr, "raw": y})["text"]
|
| 95 |
|
| 96 |
return transcribed_text
|
| 97 |
|
|
|
|
| 135 |
|
| 136 |
if spoken == "":
|
| 137 |
return 0
|
| 138 |
+
normalizer = BasicTextNormalizer()
|
| 139 |
+
spoken = clean_transcription(spoken)
|
| 140 |
+
predection = normalizer(spoken)
|
| 141 |
+
refence = normalizer(refence)
|
| 142 |
+
wer_score = wer.compute(references=[refence], predictions=[predection])
|
| 143 |
score = 1 - wer_score
|
| 144 |
return score
|
| 145 |
|
|
|
|
| 201 |
Raises:
|
| 202 |
ValueError: If the input image is not a PIL Image object or if the data is not a dictionary.
|
| 203 |
"""
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
im_array = np.array(im)
|
| 206 |
hg, wg, _ = im_array.shape
|
|
|
|
| 243 |
raise ValueError(
|
| 244 |
"Invalid input. Stream should be either a file path or a tuple of (sampling_rate, raw_audio).")
|
| 245 |
|
|
|
|
|
|
|
|
|
|
| 246 |
data = extract_text(image)
|
| 247 |
im_text_ = [data[i]["text"] for i in range(len(data))]
|
| 248 |
im_text = " ".join(im_text_)
|
|
|
|
| 251 |
im_array = np.array(Image.open(image))
|
| 252 |
data2 = None
|
| 253 |
for i in range(len(chunks)):
|
| 254 |
+
print(match(chunks[i], trns_text))
|
| 255 |
+
if match(chunks[i], trns_text) >= 0.10:
|
| 256 |
data2 = reindex_data(data, index[i], l)
|
| 257 |
break
|
| 258 |
if data2 is not None:
|
| 259 |
return process_image(im_array, data2)
|
| 260 |
else:
|
| 261 |
+
return im_array
|