Spaces:
Runtime error
Runtime error
Commit ·
4db1bc1
1
Parent(s): 2b9e17f
Make AI cleaning optional and strip Markdown link URLs with a regex by default
Browse files
app.ipynb
CHANGED
|
@@ -142,7 +142,7 @@
|
|
| 142 |
},
|
| 143 |
{
|
| 144 |
"cell_type": "code",
|
| 145 |
-
"execution_count":
|
| 146 |
"id": "4f486d3a",
|
| 147 |
"metadata": {},
|
| 148 |
"outputs": [],
|
|
@@ -168,7 +168,8 @@
|
|
| 168 |
"# from cartesia.tts import CartesiaTTS\n",
|
| 169 |
"import cartesia\n",
|
| 170 |
"import requests\n",
|
| 171 |
-
"import urllib"
|
|
|
|
| 172 |
]
|
| 173 |
},
|
| 174 |
{
|
|
@@ -810,7 +811,19 @@
|
|
| 810 |
},
|
| 811 |
{
|
| 812 |
"cell_type": "code",
|
| 813 |
-
"execution_count":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 814 |
"id": "db54a6a6-4bdc-430a-b1ea-444c249b77fb",
|
| 815 |
"metadata": {},
|
| 816 |
"outputs": [],
|
|
@@ -820,12 +833,12 @@
|
|
| 820 |
" # result = requests.get('https://r.jina.ai/'+urllib.parse.quote_plus(url))\n",
|
| 821 |
" result = requests.get('https://r.jina.ai/'+url)\n",
|
| 822 |
" result.raise_for_status()\n",
|
| 823 |
-
" return result.text"
|
| 824 |
]
|
| 825 |
},
|
| 826 |
{
|
| 827 |
"cell_type": "code",
|
| 828 |
-
"execution_count":
|
| 829 |
"id": "75891855-6c08-4a42-9ad5-a02e0b43bb3d",
|
| 830 |
"metadata": {},
|
| 831 |
"outputs": [],
|
|
@@ -858,6 +871,8 @@
|
|
| 858 |
"\n",
|
| 859 |
" idx = 0\n",
|
| 860 |
" while complete == False and idx < max_iters:\n",
|
|
|
|
|
|
|
| 861 |
" idx += 1\n",
|
| 862 |
" response = client.chat.completions.create(\n",
|
| 863 |
" model=\"gpt-4o\",\n",
|
|
@@ -895,19 +910,22 @@
|
|
| 895 |
},
|
| 896 |
{
|
| 897 |
"cell_type": "code",
|
| 898 |
-
"execution_count":
|
| 899 |
"id": "7899e7b2-beeb-40a4-a571-a2ccfc7c9618",
|
| 900 |
"metadata": {},
|
| 901 |
"outputs": [],
|
| 902 |
"source": [
|
| 903 |
"#| export\n",
|
| 904 |
-
"def get_page_text(url):\n",
|
| 905 |
-
"
|
|
|
|
|
|
|
|
|
|
| 906 |
]
|
| 907 |
},
|
| 908 |
{
|
| 909 |
"cell_type": "code",
|
| 910 |
-
"execution_count":
|
| 911 |
"id": "e4fb3159-579b-4271-bc96-4cd1e2816eca",
|
| 912 |
"metadata": {},
|
| 913 |
"outputs": [],
|
|
@@ -918,9 +936,11 @@
|
|
| 918 |
" ### Define UI ###\n",
|
| 919 |
" gr.Markdown(\"# TTS\")\n",
|
| 920 |
" gr.Markdown(\"\"\"Start typing below and then click **Go** to create the speech from your text.\n",
|
| 921 |
-
"For requests longer than allowed by the API they will be broken into chunks automatically. [Spaces Link](https://matdmiller-tts-openai.hf.space/) | <a href=\"https://matdmiller-tts-openai.hf.space/\" target=\"_blank\">Spaces Link HTML</a>\
|
|
|
|
| 922 |
" with gr.Row():\n",
|
| 923 |
" input_url = gr.Textbox(max_lines=1, label=\"Optional - Enter a URL\")\n",
|
|
|
|
| 924 |
" get_url_content_btn = gr.Button(\"Get URL Contents\")\n",
|
| 925 |
" with gr.Row():\n",
|
| 926 |
" input_text = gr.Textbox(max_lines=100, label=\"Enter text here\")\n",
|
|
@@ -946,7 +966,7 @@
|
|
| 946 |
"\n",
|
| 947 |
" ### Define UI Actions ###\n",
|
| 948 |
"\n",
|
| 949 |
-
" get_url_content_btn.click(fn=get_page_text, inputs=input_url, outputs=input_text)\n",
|
| 950 |
" \n",
|
| 951 |
" # input_text \n",
|
| 952 |
" input_text.input(fn=get_input_text_len, inputs=input_text, outputs=input_text_length)\n",
|
|
@@ -982,7 +1002,7 @@
|
|
| 982 |
},
|
| 983 |
{
|
| 984 |
"cell_type": "code",
|
| 985 |
-
"execution_count":
|
| 986 |
"id": "a00648a1-891b-470b-9959-f5d502055713",
|
| 987 |
"metadata": {},
|
| 988 |
"outputs": [],
|
|
@@ -996,7 +1016,7 @@
|
|
| 996 |
},
|
| 997 |
{
|
| 998 |
"cell_type": "code",
|
| 999 |
-
"execution_count":
|
| 1000 |
"id": "4b534fe7-4337-423e-846a-1bdb7cccc4ea",
|
| 1001 |
"metadata": {},
|
| 1002 |
"outputs": [
|
|
@@ -1025,16 +1045,9 @@
|
|
| 1025 |
"data": {
|
| 1026 |
"text/plain": []
|
| 1027 |
},
|
| 1028 |
-
"execution_count":
|
| 1029 |
"metadata": {},
|
| 1030 |
"output_type": "execute_result"
|
| 1031 |
-
},
|
| 1032 |
-
{
|
| 1033 |
-
"name": "stdout",
|
| 1034 |
-
"output_type": "stream",
|
| 1035 |
-
"text": [
|
| 1036 |
-
"TOKENS CLEANUP: 970\n"
|
| 1037 |
-
]
|
| 1038 |
}
|
| 1039 |
],
|
| 1040 |
"source": [
|
|
@@ -1060,7 +1073,7 @@
|
|
| 1060 |
},
|
| 1061 |
{
|
| 1062 |
"cell_type": "code",
|
| 1063 |
-
"execution_count":
|
| 1064 |
"id": "28e8d888-e790-46fa-bbac-4511b9ab796c",
|
| 1065 |
"metadata": {},
|
| 1066 |
"outputs": [
|
|
@@ -1090,7 +1103,7 @@
|
|
| 1090 |
},
|
| 1091 |
{
|
| 1092 |
"cell_type": "code",
|
| 1093 |
-
"execution_count":
|
| 1094 |
"id": "0420310d-930b-4904-8bd4-3458ad8bdbd3",
|
| 1095 |
"metadata": {},
|
| 1096 |
"outputs": [],
|
|
|
|
| 142 |
},
|
| 143 |
{
|
| 144 |
"cell_type": "code",
|
| 145 |
+
"execution_count": 41,
|
| 146 |
"id": "4f486d3a",
|
| 147 |
"metadata": {},
|
| 148 |
"outputs": [],
|
|
|
|
| 168 |
"# from cartesia.tts import CartesiaTTS\n",
|
| 169 |
"import cartesia\n",
|
| 170 |
"import requests\n",
|
| 171 |
+
"import urllib\n",
|
| 172 |
+
"import re"
|
| 173 |
]
|
| 174 |
},
|
| 175 |
{
|
|
|
|
| 811 |
},
|
| 812 |
{
|
| 813 |
"cell_type": "code",
|
| 814 |
+
"execution_count": 42,
|
| 815 |
+
"id": "c5b0156a-f6d4-480a-b7b5-b0899e7520b9",
|
| 816 |
+
"metadata": {},
|
| 817 |
+
"outputs": [],
|
| 818 |
+
"source": [
|
| 819 |
+
"#| export\n",
|
| 820 |
+
"def remove_urls_from_markdown(text):\n",
|
| 821 |
+
" return re.sub(r'\\[([^\\]]+)\\]\\([^\\)]+\\)', r'\\1', text)"
|
| 822 |
+
]
|
| 823 |
+
},
|
| 824 |
+
{
|
| 825 |
+
"cell_type": "code",
|
| 826 |
+
"execution_count": 43,
|
| 827 |
"id": "db54a6a6-4bdc-430a-b1ea-444c249b77fb",
|
| 828 |
"metadata": {},
|
| 829 |
"outputs": [],
|
|
|
|
| 833 |
" # result = requests.get('https://r.jina.ai/'+urllib.parse.quote_plus(url))\n",
|
| 834 |
" result = requests.get('https://r.jina.ai/'+url)\n",
|
| 835 |
" result.raise_for_status()\n",
|
| 836 |
+
" return remove_urls_from_markdown(result.text)"
|
| 837 |
]
|
| 838 |
},
|
| 839 |
{
|
| 840 |
"cell_type": "code",
|
| 841 |
+
"execution_count": 47,
|
| 842 |
"id": "75891855-6c08-4a42-9ad5-a02e0b43bb3d",
|
| 843 |
"metadata": {},
|
| 844 |
"outputs": [],
|
|
|
|
| 871 |
"\n",
|
| 872 |
" idx = 0\n",
|
| 873 |
" while complete == False and idx < max_iters:\n",
|
| 874 |
+
" print('Page Cleaning Iter:',idx)\n",
|
| 875 |
+
" assert idx < max_iters\n",
|
| 876 |
" idx += 1\n",
|
| 877 |
" response = client.chat.completions.create(\n",
|
| 878 |
" model=\"gpt-4o\",\n",
|
|
|
|
| 910 |
},
|
| 911 |
{
|
| 912 |
"cell_type": "code",
|
| 913 |
+
"execution_count": 48,
|
| 914 |
"id": "7899e7b2-beeb-40a4-a571-a2ccfc7c9618",
|
| 915 |
"metadata": {},
|
| 916 |
"outputs": [],
|
| 917 |
"source": [
|
| 918 |
"#| export\n",
|
| 919 |
+
"def get_page_text(url:str, ai_clean:bool):\n",
|
| 920 |
+
" text = get_page_md(url)\n",
|
| 921 |
+
" if ai_clean:\n",
|
| 922 |
+
" text = clean_page_md(text)\n",
|
| 923 |
+
" return text"
|
| 924 |
]
|
| 925 |
},
|
| 926 |
{
|
| 927 |
"cell_type": "code",
|
| 928 |
+
"execution_count": 50,
|
| 929 |
"id": "e4fb3159-579b-4271-bc96-4cd1e2816eca",
|
| 930 |
"metadata": {},
|
| 931 |
"outputs": [],
|
|
|
|
| 936 |
" ### Define UI ###\n",
|
| 937 |
" gr.Markdown(\"# TTS\")\n",
|
| 938 |
" gr.Markdown(\"\"\"Start typing below and then click **Go** to create the speech from your text.\n",
|
| 939 |
+
"For requests longer than allowed by the API they will be broken into chunks automatically. [Spaces Link](https://matdmiller-tts-openai.hf.space/) | <a href=\"https://matdmiller-tts-openai.hf.space/\" target=\"_blank\">Spaces Link HTML</a>\n",
|
| 940 |
+
"[https://r.jina.ai/](https://r.jina.ai/)\"\"\")\n",
|
| 941 |
" with gr.Row():\n",
|
| 942 |
" input_url = gr.Textbox(max_lines=1, label=\"Optional - Enter a URL\")\n",
|
| 943 |
+
" input_clean_cb = gr.Checkbox(value=False, label='AI Clean Text')\n",
|
| 944 |
" get_url_content_btn = gr.Button(\"Get URL Contents\")\n",
|
| 945 |
" with gr.Row():\n",
|
| 946 |
" input_text = gr.Textbox(max_lines=100, label=\"Enter text here\")\n",
|
|
|
|
| 966 |
"\n",
|
| 967 |
" ### Define UI Actions ###\n",
|
| 968 |
"\n",
|
| 969 |
+
" get_url_content_btn.click(fn=get_page_text, inputs=[input_url,input_clean_cb], outputs=input_text)\n",
|
| 970 |
" \n",
|
| 971 |
" # input_text \n",
|
| 972 |
" input_text.input(fn=get_input_text_len, inputs=input_text, outputs=input_text_length)\n",
|
|
|
|
| 1002 |
},
|
| 1003 |
{
|
| 1004 |
"cell_type": "code",
|
| 1005 |
+
"execution_count": 51,
|
| 1006 |
"id": "a00648a1-891b-470b-9959-f5d502055713",
|
| 1007 |
"metadata": {},
|
| 1008 |
"outputs": [],
|
|
|
|
| 1016 |
},
|
| 1017 |
{
|
| 1018 |
"cell_type": "code",
|
| 1019 |
+
"execution_count": 52,
|
| 1020 |
"id": "4b534fe7-4337-423e-846a-1bdb7cccc4ea",
|
| 1021 |
"metadata": {},
|
| 1022 |
"outputs": [
|
|
|
|
| 1045 |
"data": {
|
| 1046 |
"text/plain": []
|
| 1047 |
},
|
| 1048 |
+
"execution_count": 52,
|
| 1049 |
"metadata": {},
|
| 1050 |
"output_type": "execute_result"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1051 |
}
|
| 1052 |
],
|
| 1053 |
"source": [
|
|
|
|
| 1073 |
},
|
| 1074 |
{
|
| 1075 |
"cell_type": "code",
|
| 1076 |
+
"execution_count": 45,
|
| 1077 |
"id": "28e8d888-e790-46fa-bbac-4511b9ab796c",
|
| 1078 |
"metadata": {},
|
| 1079 |
"outputs": [
|
|
|
|
| 1103 |
},
|
| 1104 |
{
|
| 1105 |
"cell_type": "code",
|
| 1106 |
+
"execution_count": 53,
|
| 1107 |
"id": "0420310d-930b-4904-8bd4-3458ad8bdbd3",
|
| 1108 |
"metadata": {},
|
| 1109 |
"outputs": [],
|
app.py
CHANGED
|
@@ -6,7 +6,8 @@ __all__ = ['secret_import_failed', 'TEMP', 'TEMP_DIR', 'OPENAI_CLIENT_TTS_THREAD
|
|
| 6 |
'launch_kwargs', 'queue_kwargs', 'verify_authorization', 'split_text', 'concatenate_audio',
|
| 7 |
'create_speech_openai', 'create_speech_cartesiaai', 'create_speech', 'get_input_text_len',
|
| 8 |
'get_generation_cost', 'get_model_choices', 'update_model_choices', 'get_voice_choices',
|
| 9 |
-
'update_voice_choices', 'split_text_as_md', '
|
|
|
|
| 10 |
|
| 11 |
# %% app.ipynb 4
|
| 12 |
import os
|
|
@@ -72,6 +73,7 @@ import traceback
|
|
| 72 |
import cartesia
|
| 73 |
import requests
|
| 74 |
import urllib
|
|
|
|
| 75 |
|
| 76 |
# %% app.ipynb 11
|
| 77 |
TEMP = os.environ.get('GRADIO_TEMP_DIR','/tmp/')
|
|
@@ -333,13 +335,17 @@ def split_text_as_md(*args, **kwargs):
|
|
| 333 |
return '# Text Splits:\n' + '<br>----------<br>'.join(output)
|
| 334 |
|
| 335 |
# %% app.ipynb 38
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
def get_page_md(url):
|
| 337 |
# result = requests.get('https://r.jina.ai/'+urllib.parse.quote_plus(url))
|
| 338 |
result = requests.get('https://r.jina.ai/'+url)
|
| 339 |
result.raise_for_status()
|
| 340 |
-
return result.text
|
| 341 |
|
| 342 |
-
# %% app.ipynb
|
| 343 |
# import json
|
| 344 |
def clean_page_md(text):
|
| 345 |
max_iters = 15
|
|
@@ -356,6 +362,8 @@ def clean_page_md(text):
|
|
| 356 |
|
| 357 |
idx = 0
|
| 358 |
while complete == False and idx < max_iters:
|
|
|
|
|
|
|
| 359 |
idx += 1
|
| 360 |
response = client.chat.completions.create(
|
| 361 |
model="gpt-4o",
|
|
@@ -380,19 +388,24 @@ def clean_page_md(text):
|
|
| 380 |
# res = clean_page_md(test_page_md)
|
| 381 |
# res
|
| 382 |
|
| 383 |
-
# %% app.ipynb 42
|
| 384 |
-
def get_page_text(url):
|
| 385 |
-
return clean_page_md(get_page_md(url))
|
| 386 |
-
|
| 387 |
# %% app.ipynb 43
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 388 |
with gr.Blocks(title='TTS', head='TTS', delete_cache=(3600,3600)) as app:
|
| 389 |
|
| 390 |
### Define UI ###
|
| 391 |
gr.Markdown("# TTS")
|
| 392 |
gr.Markdown("""Start typing below and then click **Go** to create the speech from your text.
|
| 393 |
-
For requests longer than allowed by the API they will be broken into chunks automatically. [Spaces Link](https://matdmiller-tts-openai.hf.space/) | <a href="https://matdmiller-tts-openai.hf.space/" target="_blank">Spaces Link HTML</a>
|
|
|
|
| 394 |
with gr.Row():
|
| 395 |
input_url = gr.Textbox(max_lines=1, label="Optional - Enter a URL")
|
|
|
|
| 396 |
get_url_content_btn = gr.Button("Get URL Contents")
|
| 397 |
with gr.Row():
|
| 398 |
input_text = gr.Textbox(max_lines=100, label="Enter text here")
|
|
@@ -418,7 +431,7 @@ For requests longer than allowed by the API they will be broken into chunks auto
|
|
| 418 |
|
| 419 |
### Define UI Actions ###
|
| 420 |
|
| 421 |
-
get_url_content_btn.click(fn=get_page_text, inputs=input_url, outputs=input_text)
|
| 422 |
|
| 423 |
# input_text
|
| 424 |
input_text.input(fn=get_input_text_len, inputs=input_text, outputs=input_text_length)
|
|
@@ -451,13 +464,13 @@ For requests longer than allowed by the API they will be broken into chunks auto
|
|
| 451 |
|
| 452 |
|
| 453 |
|
| 454 |
-
# %% app.ipynb
|
| 455 |
# launch_kwargs = {'auth':('username',GRADIO_PASSWORD),
|
| 456 |
# 'auth_message':'Please log in to Mat\'s TTS App with username: username and password.'}
|
| 457 |
launch_kwargs = {}
|
| 458 |
queue_kwargs = {'default_concurrency_limit':10}
|
| 459 |
|
| 460 |
-
# %% app.ipynb
|
| 461 |
#.py launch
|
| 462 |
if __name__ == "__main__":
|
| 463 |
app.queue(**queue_kwargs)
|
|
|
|
| 6 |
'launch_kwargs', 'queue_kwargs', 'verify_authorization', 'split_text', 'concatenate_audio',
|
| 7 |
'create_speech_openai', 'create_speech_cartesiaai', 'create_speech', 'get_input_text_len',
|
| 8 |
'get_generation_cost', 'get_model_choices', 'update_model_choices', 'get_voice_choices',
|
| 9 |
+
'update_voice_choices', 'split_text_as_md', 'remove_urls_from_markdown', 'get_page_md', 'clean_page_md',
|
| 10 |
+
'get_page_text']
|
| 11 |
|
| 12 |
# %% app.ipynb 4
|
| 13 |
import os
|
|
|
|
| 73 |
import cartesia
|
| 74 |
import requests
|
| 75 |
import urllib
|
| 76 |
+
import re
|
| 77 |
|
| 78 |
# %% app.ipynb 11
|
| 79 |
TEMP = os.environ.get('GRADIO_TEMP_DIR','/tmp/')
|
|
|
|
| 335 |
return '# Text Splits:\n' + '<br>----------<br>'.join(output)
|
| 336 |
|
| 337 |
# %% app.ipynb 38
|
| 338 |
+
# Matches an inline Markdown link or image: optional leading '!', a label with
# no nested brackets, then a parenthesised target. Group 1 is the label.
_MD_LINK_RE = re.compile(r'!?\[([^\[\]]*)\]\([^\)]+\)')


def remove_urls_from_markdown(text):
    """Replace inline Markdown links and images with their label text.

    ``[label](url)`` and ``![alt](url)`` both collapse to the bare label, so
    no URL (and no stray ``!``) is left behind in the text.

    Args:
        text: Markdown source to clean.

    Returns:
        ``text`` with every inline link/image replaced by its label. Links
        with an empty label (``[](url)``) are removed entirely.
    """
    # The label may not contain brackets, so each pass unwraps only the
    # innermost link. Repeating until nothing changes handles linked images
    # such as [![alt](img)](href). Every substitution shortens the string,
    # so the loop always terminates.
    previous = None
    while previous != text:
        previous = text
        text = _MD_LINK_RE.sub(r'\1', text)
    return text
|
| 340 |
+
|
| 341 |
+
# %% app.ipynb 39
|
| 342 |
def get_page_md(url, timeout=60):
    """Fetch a page through the r.jina.ai proxy and strip Markdown link URLs.

    Args:
        url: Address of the page to fetch. It is appended to
            ``https://r.jina.ai/`` as-is, without URL-quoting.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection,
            which would hang the UI handler that calls this function.

    Returns:
        The response body with inline Markdown links replaced by their label
        text (see ``remove_urls_from_markdown``).

    Raises:
        requests.HTTPError: If the proxy answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    # An earlier variant quoted the URL first; kept for reference:
    # result = requests.get('https://r.jina.ai/'+urllib.parse.quote_plus(url))
    result = requests.get('https://r.jina.ai/' + url, timeout=timeout)
    result.raise_for_status()
    return remove_urls_from_markdown(result.text)
|
| 347 |
|
| 348 |
+
# %% app.ipynb 41
|
| 349 |
# import json
|
| 350 |
def clean_page_md(text):
|
| 351 |
max_iters = 15
|
|
|
|
| 362 |
|
| 363 |
idx = 0
|
| 364 |
while complete == False and idx < max_iters:
|
| 365 |
+
print('Page Cleaning Iter:',idx)
|
| 366 |
+
assert idx < max_iters
|
| 367 |
idx += 1
|
| 368 |
response = client.chat.completions.create(
|
| 369 |
model="gpt-4o",
|
|
|
|
| 388 |
# res = clean_page_md(test_page_md)
|
| 389 |
# res
|
| 390 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 391 |
# %% app.ipynb 43
|
| 392 |
+
def get_page_text(url:str, ai_clean:bool):
    """Return the text of the page at `url`, optionally AI-cleaned.

    The page is always fetched with `get_page_md`. When `ai_clean` is truthy
    the fetched text is additionally passed through `clean_page_md`;
    otherwise it is returned unchanged.

    Args:
        url: Address of the page to fetch.
        ai_clean: Whether to run the fetched text through `clean_page_md`.

    Returns:
        The fetched (and possibly cleaned) page text.
    """
    page_md = get_page_md(url)
    return clean_page_md(page_md) if ai_clean else page_md
|
| 397 |
+
|
| 398 |
+
# %% app.ipynb 44
|
| 399 |
with gr.Blocks(title='TTS', head='TTS', delete_cache=(3600,3600)) as app:
|
| 400 |
|
| 401 |
### Define UI ###
|
| 402 |
gr.Markdown("# TTS")
|
| 403 |
gr.Markdown("""Start typing below and then click **Go** to create the speech from your text.
|
| 404 |
+
For requests longer than allowed by the API they will be broken into chunks automatically. [Spaces Link](https://matdmiller-tts-openai.hf.space/) | <a href="https://matdmiller-tts-openai.hf.space/" target="_blank">Spaces Link HTML</a>
|
| 405 |
+
[https://r.jina.ai/](https://r.jina.ai/)""")
|
| 406 |
with gr.Row():
|
| 407 |
input_url = gr.Textbox(max_lines=1, label="Optional - Enter a URL")
|
| 408 |
+
input_clean_cb = gr.Checkbox(value=False, label='AI Clean Text')
|
| 409 |
get_url_content_btn = gr.Button("Get URL Contents")
|
| 410 |
with gr.Row():
|
| 411 |
input_text = gr.Textbox(max_lines=100, label="Enter text here")
|
|
|
|
| 431 |
|
| 432 |
### Define UI Actions ###
|
| 433 |
|
| 434 |
+
get_url_content_btn.click(fn=get_page_text, inputs=[input_url,input_clean_cb], outputs=input_text)
|
| 435 |
|
| 436 |
# input_text
|
| 437 |
input_text.input(fn=get_input_text_len, inputs=input_text, outputs=input_text_length)
|
|
|
|
| 464 |
|
| 465 |
|
| 466 |
|
| 467 |
+
# %% app.ipynb 45
|
| 468 |
# launch_kwargs = {'auth':('username',GRADIO_PASSWORD),
|
| 469 |
# 'auth_message':'Please log in to Mat\'s TTS App with username: username and password.'}
|
| 470 |
launch_kwargs = {}
|
| 471 |
queue_kwargs = {'default_concurrency_limit':10}
|
| 472 |
|
| 473 |
+
# %% app.ipynb 47
|
| 474 |
#.py launch
|
| 475 |
if __name__ == "__main__":
|
| 476 |
app.queue(**queue_kwargs)
|