Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python | |
| # coding: utf-8 | |
| import gradio as gr | |
| import pandas as pd | |
| from sentence_transformers import SentenceTransformer, util | |
| import torch | |
| import openai # New import for Whisper API | |
# Load the sentence-embedding model used for semantic search.
# (An alternative checkpoint, "sickcell69/bert-finetuned-ner", was tried earlier.)
model_checkpoint = "sickcell69/cti-semantic-search-minilm"
model = SentenceTransformer(model_checkpoint)

# Load the labelled corpus; search results are rendered from each row's "tokens".
data_path = 'labeled_cti_data.json'
data = pd.read_json(data_path)

# Load the precomputed corpus embeddings.
# map_location remaps the stored tensors onto a device that exists on this
# host, so embeddings saved from a GPU still load on a CPU-only machine.
embeddings_path = 'corpus_embeddings.pt'
corpus_embeddings = torch.load(
    embeddings_path,
    map_location="cuda" if torch.cuda.is_available() else "cpu",
)
def semantic_search(query, top_k=5):
    """Return the corpus entries most similar to *query*, one per line.

    Args:
        query: Free-text search query. ``None`` or a blank string is treated
            as "no query".
        top_k: Maximum number of hits to return. Defaults to 5.

    Returns:
        A newline-joined string with one ``Score: <score> - Text: <text>``
        line per hit, or an empty string when there is no query.
    """
    # Nothing to search for: skip the model call instead of embedding
    # an empty (or missing) query.
    if query is None or not query.strip():
        return ""

    query_embedding = model.encode(query, convert_to_tensor=True)
    search_hits = util.semantic_search(
        query_embedding, corpus_embeddings, top_k=top_k
    )

    results = []
    # Only one query is submitted, so only the first hit list is read.
    for hit in search_hits[0]:
        # Each corpus row stores its text as a token list; rejoin for display.
        text = " ".join(data.iloc[hit['corpus_id']]['tokens'])
        results.append(f"Score: {hit['score']:.4f} - Text: {text}")
    return "\n".join(results)
# Transcribe uploaded audio to text with the OpenAI Whisper API.
def transcribe_audio(audio_file):
    """Transcribe an uploaded audio file and return the recognised text.

    Args:
        audio_file: A filesystem path, an upload wrapper exposing the path of
            the uploaded file through a ``name`` attribute, or an already
            open binary file-like object.

    Returns:
        The ``'text'`` field of the transcription response.
    """
    # Upload wrappers carry the on-disk path in `.name`; a plain path is
    # used as-is.
    audio_path = getattr(audio_file, "name", audio_file)

    if isinstance(audio_path, str) or hasattr(audio_path, "__fspath__"):
        # Hand the API an open binary file handle rather than raw bytes,
        # and make sure the handle is closed afterwards.
        with open(audio_path, "rb") as audio_handle:
            response = openai.Audio.transcribe("whisper-1", audio_handle)
    else:
        # No usable path: assume the caller passed an open file-like object.
        response = openai.Audio.transcribe("whisper-1", audio_file)
    return response['text']
# Callback for the interface: accepts a typed query and/or an uploaded audio file.
def handle_input(input_text, audio_file):
    """Run a semantic search for the typed query or the uploaded audio.

    Args:
        input_text: Query typed into the text box.
        audio_file: Uploaded audio file, or ``None`` when nothing was uploaded.

    Returns:
        The formatted search results produced by ``semantic_search``.
    """
    # An uploaded audio file takes precedence: its transcription replaces
    # whatever was typed.
    if audio_file is not None:
        input_text = transcribe_audio(audio_file)
    # Never forward None to the search function.
    return semantic_search(input_text or "")


# Interface with a text box and a file upload. It must be wired to
# handle_input (two parameters), not semantic_search (one parameter), because
# two input components are declared. handle_input is defined above so the
# name exists when the interface is built.
iface = gr.Interface(
    fn=handle_input,
    inputs=["text", "file"],
    outputs="text",
    title="語義搜索應用",
    description="輸入一個查詢或上傳一個音頻文件,然後模型將返回最相似的結果。",
    # With several inputs, each example is a list holding one value per input.
    examples=[["", "example_audio.wav"]],
)
if __name__ == "__main__":
    # A plain iface.launch() was used previously; a public share link is
    # requested here. Original author's note: the web page does not come up.
    launch_options = {"share": True}
    iface.launch(**launch_options)