# ir12345's picture
# Upload app.py
# 0457db3 verified
import gradio as gr
import torch
import os
import requests ,json
from dotenv import load_dotenv
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, WhisperProcessor, WhisperForConditionalGeneration
import re
import librosa
# Prefer Apple-Silicon MPS, then CUDA, then fall back to CPU.
device = (
    torch.device("mps") if torch.backends.mps.is_available() else
    torch.device("cuda") if torch.cuda.is_available() else
    torch.device("cpu")
)
# Multilingual sentence-embedding model used to index and retrieve corpus text.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
model = AutoModel.from_pretrained("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
model = model.to(device)
# Whisper checkpoint for speech-to-text of recorded questions.
# NOTE(review): model_transcribe is never moved to `device`; transcribe()
# uses model_transcribe.device, so inference runs wherever HF placed it.
model_id = "openai/whisper-small"
processor_transcribe = WhisperProcessor.from_pretrained(model_id)
model_transcribe = WhisperForConditionalGeneration.from_pretrained(model_id)
# LLM endpoint credentials come from the environment (.env for local runs).
load_dotenv()
secret_key = os.getenv("MY_SECRET_KEY")
invoke_url = os.getenv("invoke_url")
# Stream LLM responses as server-sent events rather than one JSON blob.
stream = True
headers = {
    "Authorization": secret_key,
    "Accept": "text/event-stream" if stream else "application/json"
}
def gen_llm(text):
    """Send *text* as a single user message to the hosted LLM endpoint.

    Uses the module-level ``invoke_url``, ``headers`` and ``stream`` settings.

    Returns:
        The accumulated completion text when streaming, the raw response
        JSON when not streaming, or a human-readable error string if a
        streamed line is malformed.
    """
    payload = {
        "model": "meta/llama-4-maverick-17b-128e-instruct",
        "messages": [{"role": "user", "content": text}],
        "max_tokens": 512,
        "temperature": 1.00,
        "top_p": 1.00,
        "frequency_penalty": 0.00,
        "presence_penalty": 0.00,
        "stream": stream
    }
    # timeout keeps a dead endpoint from hanging the request forever
    response = requests.post(invoke_url, headers=headers, json=payload,
                             stream=stream, timeout=120)
    if not stream:
        return (response.json())
    full_text = []
    for line in response.iter_lines():
        if not line or not line.strip():
            continue  # skip keep-alive blank lines
        try:
            line_str = line.decode('utf-8')
            # SSE frames are prefixed with "data: "
            if line_str.startswith("data: "):
                line_str = line_str[6:]
            # "[DONE]" marks the end of the stream
            if line_str.strip() == "[DONE]":
                break
            line_dict = json.loads(line_str)
            content = line_dict["choices"][0]["delta"].get("content")
            if content is not None:
                full_text.append(content)
        except json.JSONDecodeError as e:
            return (f"JSON decode error: {e}")
        except (KeyError, IndexError, TypeError) as e:
            return (f"Malformed line or missing fields: {e}")
    return ''.join(full_text)
def embed_text(texts):
    """Embed a list of strings into L2-normalised sentence vectors.

    Args:
        texts: list of strings to embed (a single string also works for
            the tokenizer, producing a batch of one).

    Returns:
        torch.Tensor of shape (batch, hidden_dim) with unit-norm rows, so
        cosine similarity reduces to a plain dot product.
    """
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    # Attention-mask-weighted mean pooling: this sentence-transformers
    # checkpoint was trained with mean pooling, not with a [CLS] sentence
    # representation, so pooling the [CLS] token degrades similarity quality.
    mask = inputs['attention_mask'].unsqueeze(-1).type_as(outputs.last_hidden_state)
    summed = (outputs.last_hidden_state * mask).sum(dim=1)
    embeddings = summed / mask.sum(dim=1).clamp(min=1e-9)  # avoid div-by-zero
    embeddings = F.normalize(embeddings, p=2, dim=1)  # unit norm for cosine similarity
    return embeddings
def check_and_read_txt(file):
    """Read an uploaded text file, embed its contents, and reveal the chat UI.

    Wired to ``file_input.change`` with 8 output components:
    [output_text (visibility), output_text (value), btn, chatbot, chat_input,
    transcribe_button, transcribe_dropdown, audio_input].

    Returns:
        An 8-tuple of Gradio updates/values matching the outputs above.
    """
    if file is None:
        # BUG FIX: the original returned only 2 values here, but the event
        # has 8 outputs — Gradio would error when the upload was cleared.
        # Hide every component again and blank the text box.
        return (gr.update(visible=False), "") + tuple(
            gr.update(visible=False) for _ in range(6)
        )
    try:
        with open(file.name, 'r', encoding='utf-8') as f:
            content = f.read()
        # Split on sentence boundaries, keep the first chunk, and treat each
        # of its lines as one corpus entry (mirrors save_text).
        corpus_list = [s.strip() for s in re.split(r'\.\s+', content) if s.strip()]
        global corpus
        corpus = corpus_list[0].split('\n')
        corpus_embeddings = embed_text(corpus)
        global state_global
        state_global = corpus_embeddings
        return (gr.update(visible=True), content) + tuple(
            gr.update(visible=True) for _ in range(6)
        )
    except Exception as e:
        # Surface the error in the text box but still reveal the UI.
        return (gr.update(visible=True), f"Error reading file: {str(e)}") + tuple(
            gr.update(visible=True) for _ in range(6)
        )
def save_text(input_text):
    """Re-index manually edited corpus text and echo it back unchanged.

    Applies the same splitting scheme as check_and_read_txt: the first
    '. '-delimited chunk, line by line, becomes the retrieval corpus.
    """
    global corpus, state_global
    chunks = [piece.strip() for piece in re.split(r'\.\s+', input_text) if piece.strip()]
    corpus = chunks[0].split('\n')
    state_global = embed_text(corpus)
    return input_text
# Languages Whisper can be prompted with (choices for the dropdown below).
transcribe_list = ['english', 'chinese', 'german', 'spanish', 'russian', 'korean', 'french', 'japanese', 'portuguese', 'turkish', 'polish', 'catalan', 'dutch', 'arabic', 'swedish', 'italian', 'indonesian', 'hindi', 'finnish', 'vietnamese', 'hebrew', 'ukrainian', 'greek', 'malay', 'czech', 'romanian', 'danish', 'hungarian', 'tamil', 'norwegian', 'thai', 'urdu', 'croatian', 'bulgarian', 'lithuanian', 'latin', 'maori', 'malayalam', 'welsh', 'slovak', 'telugu', 'persian', 'latvian', 'bengali', 'serbian', 'azerbaijani', 'slovenian', 'kannada', 'estonian', 'macedonian', 'breton', 'basque', 'icelandic', 'armenian', 'nepali', 'mongolian', 'bosnian', 'kazakh', 'albanian', 'swahili', 'galician', 'marathi', 'punjabi', 'sinhala', 'khmer', 'shona', 'yoruba', 'somali', 'afrikaans', 'occitan', 'georgian', 'belarusian', 'tajik', 'sindhi', 'gujarati', 'amharic', 'yiddish', 'lao', 'uzbek', 'faroese', 'haitian creole', 'pashto', 'turkmen', 'nynorsk', 'maltese', 'sanskrit', 'luxembourgish', 'myanmar', 'tibetan', 'tagalog', 'malagasy', 'assamese', 'tatar', 'hawaiian', 'lingala', 'hausa', 'bashkir', 'javanese', 'sundanese', 'cantonese', 'burmese', 'valencian', 'flemish', 'haitian', 'letzeburgesch', 'pushto', 'panjabi', 'moldavian', 'moldovan', 'sinhalese', 'castilian', 'mandarin']
# NOTE(review): `global` at module scope is a no-op; kept from the original.
global selected_lang
# Default transcription language until the user picks one in the dropdown.
selected_lang = "english"
def transcribe(audio):
    """Transcribe a recorded audio file with Whisper.

    Args:
        audio: filepath to the recording (Gradio Audio with type="filepath"),
            or None when nothing was recorded.

    Returns:
        The transcription string, or a placeholder message for empty input.
    """
    if audio is None:
        return "No audio recorded."
    global selected_lang
    print("selected_lang", selected_lang)
    # Force decoding in the language chosen via the dropdown.
    decoder_prompt = processor_transcribe.get_decoder_prompt_ids(
        language=selected_lang, task="transcribe"
    )
    # Whisper expects 16 kHz mono input.
    waveform, _ = librosa.load(audio, sr=16000)
    features = processor_transcribe(waveform, sampling_rate=16000, return_tensors="pt")
    features = features.input_features.to(model_transcribe.device)
    generated_ids = model_transcribe.generate(features, forced_decoder_ids=decoder_prompt)
    return processor_transcribe.batch_decode(generated_ids, skip_special_tokens=True)[0]
def select_lang(language):
    """Record the dropdown's language choice for later transcribe() calls."""
    global selected_lang
    selected_lang = language
# UI components are created up-front (mostly hidden) and .render()-ed into
# the layout inside the gr.Blocks context below; they become visible only
# after a corpus file has been uploaded.
chatbot = gr.Chatbot(label="💬 Chat", type="messages", visible=False)
chat_input = gr.Textbox(label="Enter text here:", visible=False)
file_input = gr.File(label="Upload File", file_types=[".txt", "*"])
output_text = gr.Textbox(label="Result", lines=20, interactive=True, visible=False)
btn = gr.Button("Update Information", visible=False)
transcribe_button = gr.Button("Transcribe", visible=False)
# BUG FIX: label typo "Transcibe" -> "Transcribe" (user-visible string).
transcribe_dropdown = gr.Dropdown(choices=transcribe_list, label="Transcribe Language", visible=False)
audio_input = gr.Audio(sources="microphone", type="filepath", label="Record Audio", visible=False)
state_text = gr.State("")
# Retrieval state shared across callbacks: corpus sentence embeddings and
# the corpus strings themselves.  (`global` at module scope is a no-op.)
global state_global
state_global = ""
global corpus
corpus = []
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=1):
            # Left column: corpus upload and editing controls.
            file_input.render()
            output_text.render()
            btn.render()
            # output_text appears twice on purpose: the first update toggles
            # its visibility, the second sets its value.
            file_input.change(
                fn=check_and_read_txt,
                inputs=file_input,
                outputs=[output_text, output_text, btn, chatbot, chat_input,
                         transcribe_button, transcribe_dropdown, audio_input]
            )
            btn.click(
                fn=save_text,
                inputs=output_text,
                outputs=state_text
            )
        with gr.Column(scale=2):
            # Right column: chat plus voice input.
            chatbot.render()
            chat_input.render()
            with gr.Row():
                audio_input.render()
                transcribe_button.render()
                transcribe_button.click(fn=transcribe, inputs=audio_input, outputs=chat_input)
                transcribe_dropdown.render()
                transcribe_dropdown.change(fn=select_lang, inputs=transcribe_dropdown)

    def respond(message, history):
        """Answer *message* using the most similar corpus sentences.

        Args:
            message: the user's chat input.
            history: prior messages in Gradio "messages" format, or None.

        Returns:
            (updated history, "") — the empty string clears the input box.
        """
        if history is None:
            history = []
        query_embedding = embed_text([message])
        # reshape(-1) instead of squeeze(): a one-sentence corpus would
        # otherwise yield a 0-dim score tensor and break topk().
        cosine_scores = torch.matmul(query_embedding, state_global.T).reshape(-1)
        # BUG FIX: never request more neighbours than the corpus holds
        # (topk(k=3) raised for corpora with fewer than 3 entries).
        k = min(3, cosine_scores.numel())
        top_k_indices = torch.topk(cosine_scores, k=k).indices
        context = [corpus[i] for i in top_k_indices]
        prompt = (
            f"Use the following information to answer the user's question : '{context}'\n"
            f"This is chat history : {history}\n"
            f"Then answer this question : '{message}'\n"
            f"Respond clearly and helpfully as a assistant. Keep your response focused and informative, but not overly brief. No need to explain how you got the answer.\n"
            f"Answer in the same language as the user input. If the answer is not in the context, say something like: "
            f"'I do not quite understand. Could you please try rephrasing or describing it differently?'\n"
        )
        bot_ans = gen_llm(prompt)
        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": bot_ans})
        return history, ""  # Return history and clear input

    chat_input.submit(fn=respond, inputs=[chat_input, chatbot], outputs=[chatbot, chat_input])

if __name__ == "__main__":
    demo.launch()