ArseniyPerchik committed
Commit 8e6ca7f · 1 Parent(s): 4b49a7c
Files changed (3):
  1. app.py +33 -36
  2. draft_1.ipynb +0 -258
  3. draft_tts.py +21 -0
app.py CHANGED
@@ -170,47 +170,44 @@ def get_target_from_request(request_text):
     return 'No goal found.'
 
 
-def create_demo():
 # main blocks
-    with gr.Blocks(css=custom_css) as my_demo:
-        gr.Markdown("# Agent Control with Language")
-        gr.Markdown('Tell the agent where to go and what to do')
-
-        with gr.Row():
-            with gr.Column():
-                request_audio = gr.Microphone(editable=False)
-                # send_btn = gr.Button(value='Send Request')
-                request_text = gr.Textbox(label="Request:", lines=2, interactive=False)
-                request_target = gr.Textbox(label='Target:', lines=2)
-                status = gr.Textbox(label='Status:', lines=2, elem_id="mytextbox")
-            with gr.Column():
-                output_env = gr.Video(label="Env:", autoplay=True)
-                with gr.Accordion("TODO List", open=False):
-                    gr.Markdown("""
-                    ## PLAN
-                    - [x] to use audio as an input for requests
-                    - [x] to learn a policy for navigation from location to location
-                    - [x] to build an interface that will show the status of the request
-                    - [ ] to incorporate a longer chain of goals; for example, go there and pick the package, then come back
-                    - [ ] to introduce additional learnt capabilities
-                    - [ ] to build more complex environments where the movement is not so straightforward
-                    """)
-
-        # EVENTS:
-        # gr.on(triggers=["load"], fn=load_image_on_start, outputs=output_env_image)
-        # my_demo.load(fn=load_image_on_start, outputs=output_env_image)
-        my_demo.load(fn=create_standing_animation, outputs=output_env)
-        # request_audio.stream(fn=get_text_request, inputs=request_audio, outputs=request_text)
-        request_audio.stop_recording(fn=get_text_request, inputs=request_audio, outputs=request_text)
-        request_text.change(fn=get_target_from_request, inputs=request_text, outputs=request_target)
-        request_target.change(fn=move_agent, inputs=request_target, outputs=[output_env, status])
-        request_audio.stop_recording(lambda: None, outputs=request_audio)
-    return my_demo
+with gr.Blocks(css=custom_css) as demo:
+    gr.Markdown("# Agent Control with Language")
+    gr.Markdown('Tell the agent where to go and what to do')
+
+    with gr.Row():
+        with gr.Column():
+            request_audio = gr.Microphone(editable=False)
+            # send_btn = gr.Button(value='Send Request')
+            request_text = gr.Textbox(label="Request:", lines=2, interactive=False)
+            request_target = gr.Textbox(label='Target:', lines=2)
+            status = gr.Textbox(label='Status:', lines=2, elem_id="mytextbox")
+        with gr.Column():
+            output_env = gr.Video(label="Env:", autoplay=True)
+            with gr.Accordion("TODO List", open=False):
+                gr.Markdown("""
+                ## PLAN
+                - [x] to use audio as an input for requests
+                - [x] to learn a policy for navigation from location to location
+                - [x] to build an interface that will show the status of the request
+                - [ ] to incorporate a longer chain of goals; for example, go there and pick the package, then come back
+                - [ ] to introduce additional learnt capabilities
+                - [ ] to build more complex environments where the movement is not so straightforward
+                """)
+
+    # EVENTS:
+    # gr.on(triggers=["load"], fn=load_image_on_start, outputs=output_env_image)
+    # my_demo.load(fn=load_image_on_start, outputs=output_env_image)
+    demo.load(fn=create_standing_animation, outputs=output_env)
+    # request_audio.stream(fn=get_text_request, inputs=request_audio, outputs=request_text)
+    request_audio.stop_recording(fn=get_text_request, inputs=request_audio, outputs=request_text)
+    request_text.change(fn=get_target_from_request, inputs=request_text, outputs=request_target)
+    request_target.change(fn=move_agent, inputs=request_target, outputs=[output_env, status])
+    request_audio.stop_recording(lambda: None, outputs=request_audio)
 
 
 # ---------------------------- #
 # main
 # ---------------------------- #
-demo = create_demo()
 demo.launch()
 
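The net effect of this hunk: the Blocks app is no longer built inside a `create_demo()` factory but bound to a module-level variable named `demo`. The event wiring itself is unchanged — `stop_recording` transcribes the audio, the `request_text` change extracts a target, and the `request_target` change drives `move_agent`. The module-level name matters because Gradio's reload mode (`gradio app.py`, the command noted at the top of the deleted notebook below) looks for a Blocks instance called `demo` by default. A minimal sketch of the pattern, with the CSS and components elided:

```python
import gradio as gr

# A module-level Blocks bound to the name `demo`: gradio's reload mode
# (`gradio app.py`) discovers this object by name, so no factory function
# or `demo = create_demo()` call is needed.
with gr.Blocks() as demo:
    gr.Markdown("# Agent Control with Language")

demo.launch()
```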
draft_1.ipynb DELETED
@@ -1,258 +0,0 @@
The deleted notebook, rendered cell by cell (stdout/stderr shown as comments; notebook metadata omitted):

## FIRST CHECK

# gradio app.py --watch-dirs app.py

import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import tempfile
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torchaudio
import torchaudio.transforms as T
from matplotlib.patches import Circle
from stable_baselines3 import SAC
from warehouse_env import WarehouseEnv
from types import SimpleNamespace

asr_pipe_default = pipeline("automatic-speech-recognition")
# stderr: No model was supplied, defaulted to facebook/wav2vec2-base-960h
#         and revision 22aad52 (https://huggingface.co/facebook/wav2vec2-base-960h).
#         Using a pipeline without specifying a model name and revision in
#         production is not recommended.
#         Some weights of Wav2Vec2ForCTC were not initialized from the model
#         checkpoint at facebook/wav2vec2-base-960h and are newly initialized:
#         ['wav2vec2.masked_spec_embed']
#         You should probably TRAIN this model on a down-stream task to be
#         able to use it for predictions and inference.
#         Device set to use mps:0

waveform, sample_rate = torchaudio.load("sample.wav")
target_sr = 16000
resampler = T.Resample(orig_freq=sample_rate, new_freq=target_sr, dtype=waveform.dtype)
waveform = resampler(waveform)
waveform_np = waveform.squeeze().numpy()
# sample = dataset[2]["audio"]

# result = pipe(sample)
result = asr_pipe_default(waveform_np)
print(result["text"])
# stdout: THIS IS A SIMPLE TEXT

## TO SAVE THE MODEL

save_dir = './models_for_proj/wav2vec2-base-960h'

from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load pretrained model and processor
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

# Save locally
model.save_pretrained(save_dir)
processor.save_pretrained(save_dir)

## TO REUSE IT

import torchaudio
import torchaudio.transforms as T

waveform, sample_rate = torchaudio.load("sample.wav")
target_sr = 16000
resampler = T.Resample(orig_freq=sample_rate, new_freq=target_sr, dtype=waveform.dtype)
waveform = resampler(waveform)
waveform_np = waveform.squeeze().numpy()

import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

save_dir = './models_for_proj/wav2vec2-base-960h'

# load
model = Wav2Vec2ForCTC.from_pretrained(save_dir)
processor = Wav2Vec2Processor.from_pretrained(save_dir)

# Preprocess
inputs = processor(waveform_np, sampling_rate=16000, return_tensors="pt", padding=True)

# Inference
with torch.no_grad():
    logits = model(**inputs).logits

# Decode
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.decode(predicted_ids[0])

print("Transcription:", transcription)
# stdout: Transcription: THIS IS A SIMPLE TEXT
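The resampling cells above exist because wav2vec2-base-960h expects 16 kHz input. Note, though, that for a file-path input the `transformers` ASR pipeline decodes and resamples the audio itself (via ffmpeg), so — as a sketch, assuming `sample.wav` and a local ffmpeg install are available — the first check collapses to:

```python
from transformers import pipeline

# Pin the model instead of relying on the pipeline default (the default
# is what produced the "No model was supplied" warning above).
asr = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")

# A file path is decoded and resampled to the model's 16 kHz internally.
print(asr("sample.wav")["text"])  # expected: THIS IS A SIMPLE TEXT
```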
draft_tts.py ADDED
@@ -0,0 +1,21 @@
+from transformers import AutoProcessor, BarkModel
+import torch
+import scipy.io.wavfile
+
+# Load model and processor
+processor = AutoProcessor.from_pretrained("suno/bark")
+model = BarkModel.from_pretrained("suno/bark")
+
+# Input text
+text = "Hello! This is Bark speaking from Hugging Face."
+
+# Prepare inputs
+inputs = processor(text, return_tensors="pt")
+
+# Generate audio
+with torch.no_grad():
+    audio = model.generate(**inputs)
+
+# Save the waveform
+audio = audio.cpu().numpy().squeeze()
+scipy.io.wavfile.write("bark_output.wav", rate=22050, data=audio)
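
A caveat on the hard-coded rate: Bark generates audio at 24 kHz, so writing the file at 22050 Hz will play it slightly slowed and pitched down. A minimal sketch of the fix, reusing the `model` and `audio` variables from draft_tts.py above:

```python
# Read the native rate from the model instead of hard-coding 22050;
# BarkModel exposes it on generation_config (24000 for suno/bark).
sample_rate = model.generation_config.sample_rate
scipy.io.wavfile.write("bark_output.wav", rate=sample_rate, data=audio)
```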