Spaces:
Runtime error
Runtime error
tonic
committed on
Commit
·
5701b30
1
Parent(s):
e115e9b
Improve the interface, add parsing for the longest phrases, add a quick language-code fix for Surya, and add translation with Aya
Browse files- app.py +93 -26
- languages.json +0 -0
app.py
CHANGED
|
@@ -5,7 +5,7 @@ from surya.ocr import run_ocr
|
|
| 5 |
from surya.model.detection.segformer import load_model as load_det_model, load_processor as load_det_processor
|
| 6 |
from surya.model.recognition.model import load_model as load_rec_model
|
| 7 |
from surya.model.recognition.processor import load_processor as load_rec_processor
|
| 8 |
-
|
| 9 |
from gradio_client import Client
|
| 10 |
from dotenv import load_dotenv
|
| 11 |
import requests
|
|
@@ -27,7 +27,7 @@ choices = df["name"].to_list()
|
|
| 27 |
inputlanguage = ""
|
| 28 |
producetext = "\n\nProduce a complete expositional blog post in {target_language} based on the above :"
|
| 29 |
formatinputstring = "\n\nthe above text is a learning aid. you must use rich text format to rewrite the above and add 1 . a red color tags for nouns 2. a blue color tag for verbs 3. a green color tag for adjectives and adverbs:"
|
| 30 |
-
|
| 31 |
# Regular expression patterns for each color
|
| 32 |
patterns = {
|
| 33 |
"red": r'<span style="color: red;">(.*?)</span>',
|
|
@@ -41,6 +41,35 @@ matches = {
|
|
| 41 |
"blue": [],
|
| 42 |
"green": [],
|
| 43 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
class TaggedPhraseExtractor:
|
| 45 |
def __init__(self, text=''):
|
| 46 |
self.text = text
|
|
@@ -55,24 +84,32 @@ class TaggedPhraseExtractor:
|
|
| 55 |
self.patterns[color] = pattern
|
| 56 |
|
| 57 |
def extract_phrases(self):
|
| 58 |
-
"""Extract phrases for all colors and patterns added."""
|
| 59 |
-
matches = {
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
def print_phrases(self):
|
| 63 |
-
"""Extract phrases and print them."""
|
| 64 |
matches = self.extract_phrases()
|
| 65 |
-
for color,
|
| 66 |
print(f"Phrases with color {color}:")
|
| 67 |
-
for phrase in
|
|
|
|
|
|
|
|
|
|
| 68 |
print(f"- {phrase}")
|
| 69 |
-
print()
|
| 70 |
|
| 71 |
-
|
| 72 |
-
audio_client = Client(SEAMLESSM4T)
|
| 73 |
-
client = Client(SEAMLESSM4T)
|
| 74 |
-
|
| 75 |
-
def process_audio_to_text(audio_path, inputlanguage="English"):
|
| 76 |
"""
|
| 77 |
Convert audio input to text using the Gradio client.
|
| 78 |
"""
|
|
@@ -80,7 +117,7 @@ def process_audio_to_text(audio_path, inputlanguage="English"):
|
|
| 80 |
result = audio_client.predict(
|
| 81 |
audio_path,
|
| 82 |
inputlanguage,
|
| 83 |
-
|
| 84 |
api_name="/s2tt"
|
| 85 |
)
|
| 86 |
print("Audio Result: ", result)
|
|
@@ -100,8 +137,8 @@ def process_text_to_audio(text, translatefrom="English", translateto="English"):
|
|
| 100 |
return result[0]
|
| 101 |
|
| 102 |
class OCRProcessor:
|
| 103 |
-
def __init__(self,
|
| 104 |
-
self.
|
| 105 |
self.det_processor, self.det_model = load_det_processor(), load_det_model()
|
| 106 |
self.rec_model, self.rec_processor = load_rec_model(), load_rec_processor()
|
| 107 |
|
|
@@ -109,18 +146,19 @@ class OCRProcessor:
|
|
| 109 |
"""
|
| 110 |
Process a PIL image and return the OCR text.
|
| 111 |
"""
|
| 112 |
-
predictions = run_ocr([image], [self.
|
| 113 |
-
return predictions[0]
|
| 114 |
|
| 115 |
def process_pdf(self, pdf_path):
|
| 116 |
"""
|
| 117 |
Process a PDF file and return the OCR text.
|
| 118 |
"""
|
| 119 |
-
predictions = run_ocr([pdf_path], [self.
|
| 120 |
-
return predictions[0]
|
| 121 |
|
| 122 |
def process_input(image=None, file=None, audio=None, text="", translateto = "English", translatefrom = "English" ):
|
| 123 |
-
|
|
|
|
| 124 |
final_text = text
|
| 125 |
if image is not None:
|
| 126 |
ocr_prediction = ocr_processor.process_image(image)
|
|
@@ -171,7 +209,20 @@ def process_input(image=None, file=None, audio=None, text="", translateto = "Eng
|
|
| 171 |
|
| 172 |
audio_output = process_text_to_audio(processed_text, translateto, translateto)
|
| 173 |
|
| 174 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
|
| 176 |
def main():
|
| 177 |
with gr.Blocks() as demo:
|
|
@@ -193,12 +244,28 @@ def main():
|
|
| 193 |
process_button = gr.Button("🌟AyaTonic")
|
| 194 |
|
| 195 |
processed_text_output = RichTextbox(label="Processed Text")
|
| 196 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
process_button.click(
|
| 199 |
-
fn=
|
| 200 |
inputs=[image_input, file_input, audio_input, text_input, input_language, target_language],
|
| 201 |
-
outputs=[processed_text_output,
|
| 202 |
)
|
| 203 |
|
| 204 |
if __name__ == "__main__":
|
|
|
|
| 5 |
from surya.model.detection.segformer import load_model as load_det_model, load_processor as load_det_processor
|
| 6 |
from surya.model.recognition.model import load_model as load_rec_model
|
| 7 |
from surya.model.recognition.processor import load_processor as load_rec_processor
|
| 8 |
+
from lang_list import TEXT_SOURCE_LANGUAGE_NAMES , LANGUAGE_NAME_TO_CODE , text_source_language_codes
|
| 9 |
from gradio_client import Client
|
| 10 |
from dotenv import load_dotenv
|
| 11 |
import requests
|
|
|
|
| 27 |
inputlanguage = ""
|
| 28 |
producetext = "\n\nProduce a complete expositional blog post in {target_language} based on the above :"
|
| 29 |
formatinputstring = "\n\nthe above text is a learning aid. you must use rich text format to rewrite the above and add 1 . a red color tags for nouns 2. a blue color tag for verbs 3. a green color tag for adjectives and adverbs:"
|
| 30 |
+
# Prompt suffix appended after the source text when asking the model for a
# translation. It contains a literal ``{inputlanguage}`` placeholder for the
# language name. The stray trailing apostrophe of the earlier version is
# removed so it no longer leaks into the prompt.
translatetextinst = "\n\nthe above text is a learning aid. you must use markdown format to translate the above into {inputlanguage} :"
|
| 31 |
# Regular expression patterns for each color
|
| 32 |
patterns = {
|
| 33 |
"red": r'<span style="color: red;">(.*?)</span>',
|
|
|
|
| 41 |
"blue": [],
|
| 42 |
"green": [],
|
| 43 |
}
|
| 44 |
+
|
| 45 |
+
co = cohere.Client(COHERE_API_KEY)
|
| 46 |
+
audio_client = Client(SEAMLESSM4T)
|
| 47 |
+
|
| 48 |
+
def get_language_code(language_name):
    """
    Return the first two characters of the code listed for ``language_name``.

    The name is looked up in the module-level ``df`` table: the row whose
    ``name`` column equals ``language_name`` supplies the ``code`` value,
    which is then truncated to its first two characters.

    Raises:
        IndexError: if no row of ``df`` has ``name == language_name``.
    """
    codes = df.loc[df['name'] == language_name, 'code'].values
    if len(codes) == 0:
        # Same exception type the bare ``.values[0]`` lookup produced, but
        # with a message that names the offending input.
        raise IndexError(f"Unknown language name: {language_name!r}")
    return codes[0][:2]
|
| 54 |
+
|
| 55 |
+
def translate_text(text, instructions=translatetextinst, *, inputlanguage=None):
    """
    Ask the model to translate ``text`` by appending ``instructions`` to it.

    Args:
    - text (str): The initial text.
    - instructions (str): Prompt suffix appended directly after ``text``.
    - inputlanguage (str, optional): When given, replaces the literal
      ``{inputlanguage}`` placeholder in ``instructions``. When omitted the
      instructions are sent unchanged.
    Returns:
    - str: The translated text response (text of the first generation).
    """
    if inputlanguage is not None:
        # str.replace rather than str.format: custom instructions may contain
        # other braces that must be passed through untouched.
        instructions = instructions.replace("{inputlanguage}", inputlanguage)
    prompt = f"{text}{instructions}"
    response = co.generate(
        model='c4ai-aya',
        prompt=prompt,
        max_tokens=2986,
        temperature=0.6,
        k=0,
        stop_sequences=[],
        return_likelihoods='NONE'
    )
    return response.generations[0].text
|
| 72 |
+
|
| 73 |
class TaggedPhraseExtractor:
|
| 74 |
def __init__(self, text=''):
|
| 75 |
self.text = text
|
|
|
|
| 84 |
self.patterns[color] = pattern
|
| 85 |
|
| 86 |
def extract_phrases(self):
    """Extract phrases for all colors and patterns added, including the three longest phrases.

    Returns:
        tuple: ``(matches, three_matches)``.

        ``matches`` maps each color in ``self.patterns`` to a dict with
        ``'all_phrases'`` (every match of that color's pattern in
        ``self.text``, in order of appearance) and ``'top_three_longest'``
        (up to three of those, longest first).

        ``three_matches`` is a list of up to three phrases, longest first,
        ranked across all colors together. It is an empty list when nothing
        matched or no patterns are registered.
    """
    matches = {}
    every_phrase = []
    for color, pattern in self.patterns.items():
        found_phrases = re.findall(pattern, self.text)
        matches[color] = {
            'all_phrases': found_phrases,
            'top_three_longest': sorted(found_phrases, key=len, reverse=True)[:3],
        }
        every_phrase.extend(found_phrases)
    # Rank once over everything collected; assigning inside the loop would
    # keep only the phrases of whichever color happened to be processed last.
    three_matches = sorted(every_phrase, key=len, reverse=True)[:3]
    return matches, three_matches
|
| 99 |
|
| 100 |
def print_phrases(self):
    """Extract phrases and print them, including the three longest phrases.

    For every color, prints all extracted phrases followed by that color's
    three longest phrases, then a blank line.
    """
    extracted = self.extract_phrases()
    # extract_phrases returns (matches, three_matches); only the per-color
    # mapping is printed here. A plain mapping is still accepted.
    matches = extracted[0] if isinstance(extracted, tuple) else extracted
    for color, data in matches.items():
        print(f"Phrases with color {color}:")
        for phrase in data['all_phrases']:
            print(f"- {phrase}")
        print(f"\nThree longest phrases for color {color}:")
        for phrase in data['top_three_longest']:
            print(f"- {phrase}")
        print()
|
| 111 |
|
| 112 |
+
def process_audio_to_text(audio_path, inputlanguage="English", outputlanguage="English"):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
"""
|
| 114 |
Convert audio input to text using the Gradio client.
|
| 115 |
"""
|
|
|
|
| 117 |
result = audio_client.predict(
|
| 118 |
audio_path,
|
| 119 |
inputlanguage,
|
| 120 |
+
outputlanguage,
|
| 121 |
api_name="/s2tt"
|
| 122 |
)
|
| 123 |
print("Audio Result: ", result)
|
|
|
|
| 137 |
return result[0]
|
| 138 |
|
| 139 |
class OCRProcessor:
|
| 140 |
+
def __init__(self, lang_code=None):
    """
    Store the OCR language codes and load the detection/recognition models.

    Args:
        lang_code: a list of language codes, or a single code given as a
            string. Defaults to ``["en"]``. A bare string is wrapped in a
            list so that ``self.lang_code`` is always a list.
    """
    if lang_code is None:
        # A fresh list per instance instead of one shared default object.
        lang_code = ["en"]
    elif isinstance(lang_code, str):
        # The OCR calls pass ``[self.lang_code]`` next to a one-element image
        # list, so the stored value has to be a list of codes, not a string.
        # NOTE(review): assumes run_ocr wants one list of codes per image -
        # confirm against the installed surya version.
        lang_code = [lang_code]
    self.lang_code = lang_code
    self.det_processor, self.det_model = load_det_processor(), load_det_model()
    self.rec_model, self.rec_processor = load_rec_model(), load_rec_processor()
|
| 144 |
|
|
|
|
| 146 |
"""
|
| 147 |
Process a PIL image and return the OCR text.
|
| 148 |
"""
|
| 149 |
+
predictions = run_ocr([image], [self.lang_code], self.det_model, self.det_processor, self.rec_model, self.rec_processor)
|
| 150 |
+
return predictions[0]
|
| 151 |
|
| 152 |
def process_pdf(self, pdf_path):
    """
    Run OCR for a PDF path and return the first prediction from run_ocr.
    """
    # NOTE(review): the path itself is handed to run_ocr, exactly as before -
    # confirm that run_ocr accepts a file path rather than a loaded image.
    ocr_results = run_ocr(
        [pdf_path],
        [self.lang_code],
        self.det_model,
        self.det_processor,
        self.rec_model,
        self.rec_processor,
    )
    return ocr_results[0]
|
| 158 |
|
| 159 |
def process_input(image=None, file=None, audio=None, text="", translateto = "English", translatefrom = "English" ):
|
| 160 |
+
lang_code = get_language_code(translatefrom)
|
| 161 |
+
ocr_processor = OCRProcessor(lang_code)
|
| 162 |
final_text = text
|
| 163 |
if image is not None:
|
| 164 |
ocr_prediction = ocr_processor.process_image(image)
|
|
|
|
| 209 |
|
| 210 |
audio_output = process_text_to_audio(processed_text, translateto, translateto)
|
| 211 |
|
| 212 |
+
extractor = TaggedPhraseExtractor(processed_text)
|
| 213 |
+
longest_phrases = extractor.get_longest_phrases()
|
| 214 |
+
|
| 215 |
+
# Translate the longest phrases back into the native language
|
| 216 |
+
translated_phrases = [translate_text(phrase, translateto, translatefrom) for phrase in longest_phrases]
|
| 217 |
+
|
| 218 |
+
# Convert the original and translated phrases to audio
|
| 219 |
+
audio_samples = {
|
| 220 |
+
"target_language": [text_to_audio(phrase, translateto) for phrase in longest_phrases],
|
| 221 |
+
"native_language": [text_to_audio(phrase, translatefrom) for phrase in translated_phrases]
|
| 222 |
+
}
|
| 223 |
+
|
| 224 |
+
return audio_output, processed_text, audio_samples, longest_phrases, translated_phrases
|
| 225 |
+
|
| 226 |
|
| 227 |
def main():
|
| 228 |
with gr.Blocks() as demo:
|
|
|
|
| 244 |
process_button = gr.Button("🌟AyaTonic")
|
| 245 |
|
| 246 |
processed_text_output = RichTextbox(label="Processed Text")
|
| 247 |
+
longest_phrases_1 = gr.Textbox(label="Focus")
|
| 248 |
+
translated_phrases_output_1 = gr.Textbox(label="Translated Phrases")
|
| 249 |
+
audio_output_native_phrase_1 = gr.Audio(label="Audio Output (Native Language)")
|
| 250 |
+
audio_output_target_phrase_1 = gr.Audio(label="Audio Output (Target Language)")
|
| 251 |
+
longest_phrases_2 = gr.Textbox(label="Focus")
|
| 252 |
+
translated_phrases_output_2 = gr.Textbox(label="Translated Phrases")
|
| 253 |
+
audio_output_native_phrase_2 = gr.Audio(label="Audio Output (Native Language)")
|
| 254 |
+
audio_output_target_phrase_2 = gr.Audio(label="Audio Output (Target Language)")
|
| 255 |
+
longest_phrases_3 = gr.Textbox(label="Focus")
|
| 256 |
+
translated_phrases_output_3 = gr.Textbox(label="Translated Phrases")
|
| 257 |
+
audio_output_native_phrase_3 = gr.Audio(label="Audio Output (Native Language)")
|
| 258 |
+
audio_output_target_phrase_3 = gr.Audio(label="Audio Output (Target Language)")
|
| 259 |
+
|
| 260 |
+
def update_outputs(image, file, audio, text, input_language, target_language):
    """
    Run ``process_input`` and lay its results out for the 13 output widgets.

    Returns a tuple in the order of the click handler's ``outputs`` list:
    the processed text, then for each of the three phrase slots the native
    audio, the target audio, the phrase and its translation. Slots with no
    phrase get ``None`` for audio and ``""`` for text.
    """
    # process_input returns five values; the full-text audio comes first and
    # has no output component, so it is dropped here.
    _, processed_text, audio_samples, longest_phrases, translated_phrases = process_input(
        image, file, audio, text,
        # Passed by keyword: process_input declares translateto before
        # translatefrom, the reverse of the order of this function's arguments.
        translateto=target_language, translatefrom=input_language)

    def slot(values, index, empty):
        """Return values[index], or ``empty`` when fewer items were produced."""
        return values[index] if index < len(values) else empty

    outputs = [processed_text]
    for index in range(3):
        outputs.extend([
            slot(audio_samples['native_language'], index, None),
            slot(audio_samples['target_language'], index, None),
            slot(longest_phrases, index, ""),
            slot(translated_phrases, index, ""),
        ])
    return tuple(outputs)
|
| 264 |
|
| 265 |
process_button.click(
|
| 266 |
+
fn=update_outputs,
|
| 267 |
inputs=[image_input, file_input, audio_input, text_input, input_language, target_language],
|
| 268 |
+
outputs=[processed_text_output, audio_output_native_phrase_1, audio_output_target_phrase_1, longest_phrases_1, translated_phrases_output_1, audio_output_native_phrase_2, audio_output_target_phrase_2, longest_phrases_2, translated_phrases_output_2, audio_output_native_phrase_3, audio_output_target_phrase_3, longest_phrases_3, translated_phrases_output_3] #add education output
|
| 269 |
)
|
| 270 |
|
| 271 |
if __name__ == "__main__":
|
languages.json
DELETED
|
File without changes
|