Prajwal3009 commited on
Commit
aad25fe
·
verified ·
1 Parent(s): 4227b5b

Upload 12 files

Browse files
backend/app/.env ADDED
@@ -0,0 +1 @@
 
 
1
+ OPENAI_API_KEY = REDACTED  # SECURITY: this key was committed to a public repo — revoke and rotate it immediately; keep .env out of version control and supply the key via runtime environment variables.
backend/app/1.tflite ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10c95ea3eb9a7bb4cb8bddf6feb023250381008177ac162ce169694d05c317de
3
+ size 4126810
backend/app/__pycache__/speech.cpython-312.pyc ADDED
Binary file (4.68 kB). View file
 
backend/app/audio.wav ADDED
Binary file (255 kB). View file
 
backend/app/client.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""WebSocket microphone client: streams mic audio to the server and logs replies."""
import asyncio
import websockets
import pyaudio
import threading
import logging
import json
import time
import struct

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Audio capture configuration: 16 kHz mono 16-bit PCM, 1024-frame buffers.
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
CHUNK = 1024
AUDIO_SERVER_URL = 'ws://localhost:8000/ws'  # Your websocket URL
# Example: AUDIO_SERVER_URL = "ws://localhost:8080/ws/your_user_id"
20
async def audio_sender(queue, websocket):
    """Forward audio chunks from *queue* to *websocket*.

    Stops cleanly when the ``None`` sentinel (pushed by the recorder thread
    on shutdown) arrives, instead of attempting to send ``None`` over the
    socket, which would raise inside the websocket library.
    """
    while True:
        audio_data = await queue.get()
        if audio_data is None:  # recorder signalled end-of-stream
            break
        await websocket.send(audio_data)
25
def record_audio_to_queue(queue, loop):
    """Capture microphone audio on a worker thread and push chunks onto *queue*.

    Runs until the stream errors out; always pushes a ``None`` sentinel on
    exit so consumers running on *loop* know the stream has ended.
    """
    pa = pyaudio.PyAudio()
    stream = pa.open(
        format=FORMAT,
        channels=CHANNELS,
        rate=RATE,
        input=True,
        frames_per_buffer=CHUNK,
    )
    try:
        while True:
            chunk = stream.read(CHUNK)
            # Hand the chunk to the asyncio loop from this non-async thread.
            asyncio.run_coroutine_threadsafe(queue.put(chunk), loop)
    except Exception as exc:
        print(f"Error recording audio: {exc}")
    finally:
        stream.stop_stream()
        stream.close()
        pa.terminate()
        # End-of-stream sentinel for consumers.
        asyncio.run_coroutine_threadsafe(queue.put(None), loop)
46
+
47
async def receive_messages(websocket):
    """Log every message the server pushes until the connection closes."""
    try:
        while True:
            logging.info(await websocket.recv())
    except websockets.ConnectionClosed:
        print("Connection closed")
    except Exception as e:
        print(f"Error receiving message: {e}")
59
+
60
async def main():
    """Connect to the audio server and run the sender and receiver concurrently.

    A background thread records microphone audio into an asyncio queue while
    this coroutine forwards chunks to the server and logs incoming messages.
    """
    async with websockets.connect(AUDIO_SERVER_URL) as websocket:
        queue = asyncio.Queue()

        # get_running_loop() is the supported call from inside a coroutine;
        # get_event_loop() here is deprecated since Python 3.10.
        loop = asyncio.get_running_loop()
        audio_thread = threading.Thread(target=record_audio_to_queue, args=(queue, loop))
        audio_thread.start()

        await asyncio.gather(
            audio_sender(queue, websocket),
            receive_messages(websocket),
        )

        audio_thread.join()

if __name__ == "__main__":
    asyncio.run(main())
backend/app/server.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""FastAPI backend: audio transcription (Whisper), chat (GPT-4), image generation (DALL-E)."""
# Imports deduplicated (FastAPI was imported twice) and grouped
# stdlib / third-party / local per convention.
import asyncio
import json
import logging
import os
import struct
import threading
import time

import openai
import pyaudio
import websockets
from dotenv import load_dotenv
from fastapi import FastAPI, File, Form, UploadFile, WebSocket
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse, JSONResponse
from openai import OpenAI

from speech import record_audio

load_dotenv()
client = OpenAI()  # reads OPENAI_API_KEY from the environment
OpenAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Audio configuration (must match the streaming client).
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
CHUNK = 1024

# Initialize FastAPI
app = FastAPI()
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:3000"],  # Allow requests from this origin
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# In-memory conversation log served by /chat-history (lost on restart).
chat_history = []

# Legacy module-level key for the classic `openai` API surface.
openai.api_key = OpenAI_API_KEY
41
@app.get("/api-key")
def get_api_key():
    """Return the OpenAI API key for the frontend.

    SECURITY NOTE(review): this endpoint hands the secret key to any client
    that can reach the server; it should be removed or put behind
    authentication — confirm with the frontend before changing behavior.
    """
    return {"API_KEY": os.getenv("OPENAI_API_KEY")}
44
@app.post("/upload")
async def upload_file(file: UploadFile = File(...)):
    """Accept an uploaded audio file, transcribe it, and answer via GPT-4 or DALL-E.

    The clip is written to the fixed path ``audio.wav`` before transcription.
    """
    try:
        payload = await file.read()
        with open("audio.wav", "wb") as out:
            out.write(payload)

        # Transcribe the saved clip, then route on what was asked for.
        text = process_audio_with_whisper("audio.wav")
        if "generate an image" in text.lower():
            image_url = generate_image_with_dalle(text)
            chat_history.append({"type": "image", "content": image_url})
            return JSONResponse(content={"image_url": image_url})

        response = generate_response_with_gpt4(text)
        chat_history.append({"type": "text", "content": response})
        return JSONResponse(content={"response": response})
    except Exception as e:
        logging.error(f"Error processing file: {e}")
        return JSONResponse(content={"error": str(e)}, status_code=500)
62
+
63
@app.post("/text-input")
async def text_input(prompt: str = Form(...)):
    """Answer a typed prompt with GPT-4 text, or DALL-E when an image is requested."""
    try:
        # Route to image generation when the prompt explicitly asks for one.
        wants_image = ("generate an image" in prompt.lower()
                       or "generate a realistic image" in prompt.lower())
        if wants_image:
            image_url = generate_image_with_dalle(prompt)
            chat_history.append({"type": "image", "content": image_url})
            return JSONResponse(content={"image_url": image_url})

        response = generate_response_with_gpt4(prompt)
        chat_history.append({"type": "text", "content": response})
        return JSONResponse(content={"response": response})
    except Exception as e:
        logging.error(f"Error processing text input: {e}")
        return JSONResponse(content={"error": str(e)}, status_code=500)
76
+
77
@app.post("/image-url-input")
async def image_input(url: str = Form(...), prompt: str = Form(...)):
    """Run GPT-4 vision over the image at *url* with the question in *prompt*."""
    try:
        answer = process_image_with_gpt4(url, prompt)
        chat_history.append({"type": "text", "content": answer})
        return JSONResponse(content={"response": answer})
    except Exception as e:
        logging.error(f"Error processing image input: {e}")
        return JSONResponse(content={"error": str(e)}, status_code=500)
87
+
88
@app.get("/chat-history")
async def get_chat_history():
    """Return the in-memory chat history accumulated since server start."""
    return JSONResponse(content={"chat_history": chat_history})
91
+
92
# Default location where /upload stores the incoming clip.
filepath = "audio.wav"

def process_audio_with_whisper(filepath):
    """Transcribe the audio file at *filepath* with OpenAI Whisper and return the text.

    Errors are logged, then re-raised for the caller's handler.
    """
    try:
        # `with` guarantees the handle is closed — the original opened the
        # file and never closed it, leaking a descriptor per request.
        with open(filepath, "rb") as audio_file:
            transcription = client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
            )
        print(transcription.text)
        return transcription.text
    except Exception as e:
        logging.error(f"Error transcribing audio: {e}")
        raise
109
+
110
def generate_response_with_gpt4(text):
    """Return a GPT-4 Turbo chat completion for *text* (logs and re-raises errors)."""
    try:
        prompt_messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": text},
        ]
        completion = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=prompt_messages,
        )
        answer = completion.choices[0].message.content
        print(answer)
        return answer
    except Exception as e:
        logging.error(f"Error generating response: {e}")
        raise
132
def generate_image_with_dalle(prompt):
    """Generate one 1024x1024 HD image for *prompt* with DALL-E 3 and return its URL."""
    try:
        response = client.images.generate(
            model="dall-e-3",
            prompt=prompt,
            size="1024x1024",
            quality="hd",
            n=1,
        )
        return response.data[0].url
    except Exception as e:
        # Added to match the log-and-raise convention of every other helper
        # in this module; previously API errors surfaced unlogged.
        logging.error(f"Error generating image: {e}")
        raise
141
+
142
def process_image_with_gpt4(url, text):
    """Ask GPT-4o about the image at *url* using the question in *text*."""
    try:
        user_content = [
            {"type": "text", "text": text},
            {"type": "image_url", "image_url": {"url": url}},
        ]
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": user_content}],
        )
        return completion.choices[0].message.content
    except Exception as e:
        logging.error(f"Error processing image: {e}")
        raise
166
+
167
if __name__ == "__main__":
    # Local development entry point; the Docker image starts uvicorn itself.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
backend/app/speech.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Microphone recorder that classifies each chunk with a TFLite YAMNet model."""
import pyaudio
import numpy as np
import tensorflow as tf
import zipfile
import wave
import time

# Audio stream configuration
FORMAT = pyaudio.paInt16   # 16-bit PCM
CHANNELS = 1               # Mono channel
RATE = 16000               # 16 kHz sample rate
CHUNK = 1024               # Buffer size
TARGET_LENGTH = 15600      # model input length in samples — presumably YAMNet's window; confirm
SILENCE_THRESHOLD = 5000   # 5 seconds of silence, expressed in milliseconds

# Rolling waveform window fed to the classifier on every chunk.
audio_buffer = np.zeros(TARGET_LENGTH, dtype=np.float32)

model_path = '1.tflite'
interpreter = tf.lite.Interpreter(model_path=model_path)
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

waveform_input_index = input_details[0]['index']
scores_output_index = output_details[0]['index']

# The class-name list ships inside the .tflite archive itself.
with zipfile.ZipFile(model_path) as z:
    with z.open('yamnet_label_list.txt') as f:
        labels = [line.decode('utf-8').strip() for line in f]

# Ensure the input tensor is correctly sized for one full window.
interpreter.resize_tensor_input(waveform_input_index, [TARGET_LENGTH], strict=False)
interpreter.allocate_tensors()

# Initialize PyAudio
p = pyaudio.PyAudio()
36
+
37
def record_audio():
    """Record microphone audio to audio.wav, classifying each chunk, until
    sustained silence or Ctrl+C; returns the written file path ("audio.wav").

    Bug fix: assigning ``audio_buffer = np.roll(audio_buffer, ...)`` made
    ``audio_buffer`` a function-local name, so the first read raised
    UnboundLocalError; ``global`` restores the intended module-level
    rolling window.
    """
    global audio_buffer
    try:
        # Open the audio stream
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)

        print("Recording... Press Ctrl+C to stop.")

        # Open a .wav file to save the audio
        wf = wave.open("audio.wav", 'wb')
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(RATE)

        last_speech_time = time.time()

        # Continuously read from the stream, persist, and classify.
        while True:
            audio_data = stream.read(CHUNK)
            # Normalize 16-bit PCM to [-1, 1) floats for the model.
            audio_chunk = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
            # Slide the fixed-size window left and append the new chunk.
            audio_buffer = np.roll(audio_buffer, -len(audio_chunk))
            audio_buffer[-len(audio_chunk):] = audio_chunk

            # Write audio data to the .wav file
            wf.writeframes(audio_data)

            # Run the classifier over the current window.
            interpreter.set_tensor(waveform_input_index, audio_buffer)
            interpreter.invoke()
            scores = interpreter.get_tensor(scores_output_index)

            # Print the top classification result.
            prediction = labels[scores.argmax()]
            print(prediction)

            # Silence tracking: reset the timer on audible input, stop once
            # the silence exceeds SILENCE_THRESHOLD (ms → seconds).
            if np.max(np.abs(audio_chunk)) > 0.01:
                last_speech_time = time.time()
            elif time.time() - last_speech_time > SILENCE_THRESHOLD / 1000:
                print("Silence detected. Stopping recording.")
                break

    except KeyboardInterrupt:
        # Handle the KeyboardInterrupt to stop recording
        print("\nRecording stopped by user.")

    finally:
        # Stop and close the stream and terminate PyAudio
        stream.stop_stream()
        stream.close()
        p.terminate()
        wf.close()
        print("Stream closed and resources released.")

    return "audio.wav"

if __name__ == "__main__":
    record_audio()
backend/app/trans.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Standalone helper: transcribe the local audio.wav with OpenAI Whisper."""
from openai import OpenAI
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Access environment variables
api_key = os.getenv('OPENAI_API_KEY')

# OpenAI() picks up OPENAI_API_KEY from the environment on its own.
client = OpenAI()
19
def process_audio_with_whisper():
    """Transcribe the local file audio.wav with Whisper-1 and return the text."""
    with open("audio.wav", "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model="whisper-1", file=audio_file
        )
    print(transcription.text)
    return transcription.text

if __name__ == "__main__":
    process_audio_with_whisper()
backend/app/uploaded_image.png ADDED
backend/app/yamnet_label_list.txt ADDED
File without changes
backend/docker.dockerfile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.12.4

# Run as an unprivileged user inside the container.
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Install dependencies first so Docker can cache this layer across builds.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
# SECURITY NOTE(review): baking .env (which holds the OpenAI key) into the
# image leaks the secret to anyone who can pull it; prefer runtime env vars.
COPY --chown=user .env .env

# NOTE(review): "app:app" assumes a top-level app.py exposing `app`; this
# repo's server lives at app/server.py, so the target is likely
# "app.server:app" — verify against the actual build context.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
backend/requirements.txt ADDED
Binary file (218 Bytes). View file