"""Gradio RAG chat app.

Upload a .txt file to build a retrieval corpus (sentence-transformer
embeddings), then chat: each question retrieves the top-k corpus lines and
sends them as context to an LLM endpoint. Voice input is transcribed with
Whisper in a user-selected language.
"""

import json
import os
import re

import gradio as gr
import librosa
import requests
import torch
import torch.nn.functional as F
from dotenv import load_dotenv
from transformers import (
    AutoModel,
    AutoTokenizer,
    WhisperForConditionalGeneration,
    WhisperProcessor,
)

# Prefer Apple MPS, then CUDA, then CPU.
device = (
    torch.device("mps")
    if torch.backends.mps.is_available()
    else torch.device("cuda")
    if torch.cuda.is_available()
    else torch.device("cpu")
)

# Sentence-embedding model used for retrieval.
tokenizer = AutoTokenizer.from_pretrained(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)
model = AutoModel.from_pretrained(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)
model = model.to(device)

# Whisper model used for speech-to-text (runs on its default device).
model_id = "openai/whisper-small"
processor_transcribe = WhisperProcessor.from_pretrained(model_id)
model_transcribe = WhisperForConditionalGeneration.from_pretrained(model_id)

load_dotenv()
secret_key = os.getenv("MY_SECRET_KEY")
invoke_url = os.getenv("invoke_url")

stream = True
headers = {
    "Authorization": secret_key,
    "Accept": "text/event-stream" if stream else "application/json",
}


def gen_llm(text):
    """Send *text* as a single user message to the LLM endpoint.

    Returns the generated reply as a string when streaming, the raw JSON
    response when not, or a human-readable error string on malformed
    stream data (preserving the original error-as-return contract).
    """
    payload = {
        "model": "meta/llama-4-maverick-17b-128e-instruct",
        "messages": [{"role": "user", "content": text}],
        "max_tokens": 512,
        "temperature": 1.00,
        "top_p": 1.00,
        "frequency_penalty": 0.00,
        "presence_penalty": 0.00,
        "stream": stream,
    }
    # NOTE(review): no timeout is set; a hung endpoint blocks the handler.
    # Left as-is because a read timeout could cut off long generations.
    response = requests.post(invoke_url, headers=headers, json=payload, stream=stream)

    if not stream:
        return response.json()

    full_text = []
    for line in response.iter_lines():
        if not line or not line.strip():
            continue  # skip keep-alive blank lines
        try:
            line_str = line.decode("utf-8")
            # SSE frames are prefixed with "data: "
            if line_str.startswith("data: "):
                line_str = line_str[6:]
            # "[DONE]" marks the end of the stream.
            if line_str.strip() == "[DONE]":
                break
            line_dict = json.loads(line_str)
            content = line_dict["choices"][0]["delta"].get("content")
            if content is not None:
                full_text.append(content)
        except json.JSONDecodeError as e:
            return f"JSON decode error: {e}"
        except (KeyError, IndexError, TypeError) as e:
            return f"Malformed line or missing fields: {e}"
    return "".join(full_text)


def embed_text(texts):
    """Embed a list of strings into L2-normalised [CLS] vectors.

    Normalisation makes dot products equal to cosine similarity, which
    `respond` relies on for retrieval scoring.
    """
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :]  # [CLS] token
    embeddings = F.normalize(embeddings, p=2, dim=1)  # unit norm -> dot == cosine
    return embeddings


def _build_corpus(text):
    """Split *text* into corpus lines and refresh the module-level
    retrieval state (`corpus` + `state_global`) used by `respond`.

    Shared by `check_and_read_txt` and `save_text` so both code paths
    build the corpus identically.
    """
    global corpus, state_global
    corpus_list = [s.strip() for s in re.split(r'\.\s+', text) if s.strip()]
    # NOTE(review): only the FIRST '.'-separated segment is kept and then
    # split on newlines — everything after the first sentence boundary is
    # discarded. Preserved as-is; confirm this is intentional.
    corpus = corpus_list[0].split('\n')
    state_global = embed_text(corpus)


def check_and_read_txt(file):
    """Read an uploaded text file, build the retrieval corpus, and reveal
    the dependent UI widgets.

    Returns updates for the 8 components wired in `file_input.change`:
    output_text visibility, output_text value, then visibility for btn,
    chatbot, chat_input, transcribe_button, transcribe_dropdown,
    audio_input.
    """
    show = gr.update(visible=True)
    if file is None:
        # Fix: the original returned only 2 values here, but the event is
        # wired to 8 outputs — that raised at runtime. Hide everything.
        hide = gr.update(visible=False)
        return hide, "", hide, hide, hide, hide, hide, hide
    try:
        # Fix: explicit UTF-8 instead of the platform default encoding.
        with open(file.name, 'r', encoding='utf-8') as f:
            content = f.read()
        _build_corpus(content)
        return show, content, show, show, show, show, show, show
    except Exception as e:
        return (
            show, f"Error reading file: {str(e)}",
            show, show, show, show, show, show,
        )


def save_text(input_text):
    """Re-embed the (possibly user-edited) corpus text and echo it back."""
    _build_corpus(input_text)
    return input_text


# Languages accepted by Whisper's tokenizer for forced decoding.
transcribe_list = [
    'english', 'chinese', 'german', 'spanish', 'russian', 'korean', 'french',
    'japanese', 'portuguese', 'turkish', 'polish', 'catalan', 'dutch',
    'arabic', 'swedish', 'italian', 'indonesian', 'hindi', 'finnish',
    'vietnamese', 'hebrew', 'ukrainian', 'greek', 'malay', 'czech',
    'romanian', 'danish', 'hungarian', 'tamil', 'norwegian', 'thai', 'urdu',
    'croatian', 'bulgarian', 'lithuanian', 'latin', 'maori', 'malayalam',
    'welsh', 'slovak', 'telugu', 'persian', 'latvian', 'bengali', 'serbian',
    'azerbaijani', 'slovenian', 'kannada', 'estonian', 'macedonian',
    'breton', 'basque', 'icelandic', 'armenian', 'nepali', 'mongolian',
    'bosnian', 'kazakh', 'albanian', 'swahili', 'galician', 'marathi',
    'punjabi', 'sinhala', 'khmer', 'shona', 'yoruba', 'somali', 'afrikaans',
    'occitan', 'georgian', 'belarusian', 'tajik', 'sindhi', 'gujarati',
    'amharic', 'yiddish', 'lao', 'uzbek', 'faroese', 'haitian creole',
    'pashto', 'turkmen', 'nynorsk', 'maltese', 'sanskrit', 'luxembourgish',
    'myanmar', 'tibetan', 'tagalog', 'malagasy', 'assamese', 'tatar',
    'hawaiian', 'lingala', 'hausa', 'bashkir', 'javanese', 'sundanese',
    'cantonese', 'burmese', 'valencian', 'flemish', 'haitian',
    'letzeburgesch', 'pushto', 'panjabi', 'moldavian', 'moldovan',
    'sinhalese', 'castilian', 'mandarin',
]

# Current transcription language; mutated by the dropdown callback.
selected_lang = "english"


def transcribe(audio):
    """Transcribe a recorded audio file with Whisper in `selected_lang`."""
    if audio is None:
        return "No audio recorded."
    print("selected_lang", selected_lang)
    forced_decoder_ids = processor_transcribe.get_decoder_prompt_ids(
        language=selected_lang, task="transcribe"
    )
    # Whisper expects 16 kHz mono input.
    audio_array, sampling_rate = librosa.load(audio, sr=16000)
    inputs = processor_transcribe(
        audio_array, sampling_rate=16000, return_tensors="pt"
    )
    input_features = inputs.input_features.to(model_transcribe.device)
    predicted_ids = model_transcribe.generate(
        input_features, forced_decoder_ids=forced_decoder_ids
    )
    transcription = processor_transcribe.batch_decode(
        predicted_ids, skip_special_tokens=True
    )[0]
    return transcription


def select_lang(language):
    """Dropdown callback: remember the chosen transcription language."""
    global selected_lang
    selected_lang = language


# UI components are created hidden and revealed once a file is uploaded.
chatbot = gr.Chatbot(label="💬 Chat", type="messages", visible=False)
chat_input = gr.Textbox(label="Enter text here:", visible=False)
file_input = gr.File(label="Upload File", file_types=[".txt", "*"])
output_text = gr.Textbox(label="Result", lines=20, interactive=True, visible=False)
btn = gr.Button("Update Information", visible=False)
transcribe_button = gr.Button("Transcribe", visible=False)
transcribe_dropdown = gr.Dropdown(
    # Fix: label typo "Transcibe" -> "Transcribe".
    choices=transcribe_list, label="Transcribe Language", visible=False
)
audio_input = gr.Audio(
    sources="microphone", type="filepath", label="Record Audio", visible=False
)
state_text = gr.State("")

# Module-level retrieval state, filled in by _build_corpus().
state_global = ""
corpus = []

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=1):
            file_input.render()
            output_text.render()
            btn.render()
            file_input.change(
                fn=check_and_read_txt,
                inputs=file_input,
                outputs=[
                    output_text, output_text, btn, chatbot, chat_input,
                    transcribe_button, transcribe_dropdown, audio_input,
                ],
            )
            btn.click(fn=save_text, inputs=output_text, outputs=state_text)
        with gr.Column(scale=2):
            chatbot.render()
            chat_input.render()
            with gr.Row():
                audio_input.render()
                transcribe_button.render()
                transcribe_button.click(
                    fn=transcribe, inputs=audio_input, outputs=chat_input
                )
                transcribe_dropdown.render()
                transcribe_dropdown.change(
                    fn=select_lang, inputs=transcribe_dropdown
                )

    def respond(message, history):
        """Chat callback: retrieve top-k corpus lines, ask the LLM, and
        append the exchange to the messages-format history."""
        if history is None:
            history = []
        query_embedding = embed_text([message])
        cosine_scores = torch.matmul(query_embedding, state_global.T).squeeze()
        # Fix: cap k so corpora with fewer than 3 lines don't crash topk.
        k = min(3, len(corpus))
        top_k_indices = torch.topk(cosine_scores, k=k).indices
        context = [corpus[i] for i in top_k_indices]
        prompt = (
            f"Use the following information to answer the user's question : '{context}'\n"
            f"This is chat history : {history}\n"
            f"Then answer this question : '{message}'\n"
            f"Respond clearly and helpfully as a assistant. Keep your response focused and informative, but not overly brief. No need to explain how you got the answer.\n"
            f"Answer in the same language as the user input. If the answer is not in the context, say something like: "
            f"'I do not quite understand. Could you please try rephrasing or describing it differently?'\n"
        )
        bot_ans = gen_llm(prompt)
        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": bot_ans})
        return history, ""  # updated history; clear the input box

    chat_input.submit(
        fn=respond, inputs=[chat_input, chatbot], outputs=[chatbot, chat_input]
    )

if __name__ == "__main__":
    demo.launch()