tst
Browse files- app.py +47 -0
- requirements.txt +7 -0
app.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Document Q&A app to run on a Hugging Face Space (not for automatic speech recognition).
|
| 2 |
+
|
| 3 |
+
import streamlit as st
|
| 4 |
+
import torch
|
| 5 |
+
from transformers import AutoModelForCTC
|
| 6 |
+
from transformers import AutoProcessor
|
| 7 |
+
import faiss
|
| 8 |
+
import numpy as np
|
| 9 |
+
|
| 10 |
+
# Hugging Face Hub id of the text-embedding model
# (https://huggingface.co/Salesforce/SFR-Embedding-Mistral).
EMBEDDING_MODEL_ID = "Salesforce/SFR-Embedding-Mistral"

# Upper bound on tokens per document; longer texts are truncated.
# NOTE(review): 4096 is an assumption -- confirm against the model card.
MAX_TOKENS = 4096


def masked_mean_pool(last_hidden_state: torch.Tensor,
                     attention_mask: torch.Tensor) -> torch.Tensor:
    """Average token vectors over real (non-padding) positions only.

    Args:
        last_hidden_state: Tensor indexed as (batch, sequence, hidden).
        attention_mask: Tensor indexed as (batch, sequence); non-zero entries
            mark positions that take part in the average.

    Returns:
        A (batch, hidden) tensor. Rows whose mask is entirely zero come back
        as zeros rather than NaN.
    """
    weights = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
    # Clamp so an all-zero mask divides by 1 instead of 0.
    token_counts = weights.sum(dim=1).clamp(min=1.0)
    return (last_hidden_state * weights).sum(dim=1) / token_counts


def load_embedder():
    """Load the tokenizer and base encoder for ``EMBEDDING_MODEL_ID``.

    The Hugging Face token is read from the ``HF_KEY`` environment variable;
    when it is unset or empty the Hub is accessed anonymously.

    Returns:
        A ``(tokenizer, model)`` tuple with the model switched to eval mode.
    """
    # Local imports: the file-level imports only provide the CTC/processor
    # classes, which are meant for speech models rather than text encoders.
    import os

    from transformers import AutoModel, AutoTokenizer

    hf_token = os.environ.get("HF_KEY") or None
    tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_ID, token=hf_token)
    model = AutoModel.from_pretrained(EMBEDDING_MODEL_ID, token=hf_token)
    model.eval()
    return tokenizer, model


def embed_text(text, tokenizer, model):
    """Return a float32 array of shape (1, hidden) embedding ``text``.

    NOTE(review): mean pooling is kept from the original script; check the
    model card for the pooling strategy the model was trained with.
    """
    # A single text needs no padding, so none is requested.
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=MAX_TOKENS
    )
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
    pooled = masked_mean_pool(hidden, inputs["attention_mask"])
    # FAISS expects contiguous float32 input.
    return np.ascontiguousarray(pooled.float().cpu().numpy(), dtype=np.float32)


def main():
    """Streamlit entry point: upload files, embed them, add them to FAISS."""
    # cache_resource keeps the model alive across Streamlit reruns instead of
    # reloading it on every widget interaction.
    tokenizer, embeddings_model = st.cache_resource(load_embedder)()

    # Use streamlit to select one or more files.
    uploaded_files = st.file_uploader("Choose a file", accept_multiple_files=True)

    # Size the index from the model config rather than a hard-coded 768, so
    # the index dimension always matches the vectors that are added.
    index = faiss.IndexFlatL2(embeddings_model.config.hidden_size)

    # Nothing uploaded yet: do not report success for work never done.
    if not uploaded_files:
        return

    success = True  # Assume success by default
    for file in uploaded_files:
        # Only UTF-8 text can be embedded directly; binary formats (PDF,
        # Word, Excel) need a text-extraction step this app does not have.
        try:
            text = file.read().decode("utf-8")
        except UnicodeDecodeError as e:
            success = False
            st.write(f"Failed to read {file.name} as UTF-8 text: {e}")
            break

        # UI boundary: report any embedding/indexing failure to the user
        # instead of crashing the app.
        try:
            index.add(embed_text(text, tokenizer, embeddings_model))
        except Exception as e:
            success = False  # Set success to False if an exception occurs
            st.write(f"Failed to add embeddings to the index: {e}")
            break

    if success:
        st.write("Embeddings added to the index successfully")
    else:
        st.write("Operation failed")


# Streamlit executes the script with __name__ set to "__main__".
if __name__ == "__main__":
    main()
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
torch
|
| 3 |
+
transformers
|
| 4 |
+
librosa
|
| 5 |
+
numpy
|
| 6 |
+
soundfile
|
| 7 |
+
faiss-cpu
|