Spaces:
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,7 +5,7 @@ import faiss
|
|
| 5 |
from PyPDF2 import PdfReader
|
| 6 |
from sentence_transformers import SentenceTransformer
|
| 7 |
from transformers import AutoTokenizer, AutoModel
|
| 8 |
-
import torch
|
| 9 |
from langchain.vectorstores import FAISS
|
| 10 |
from langchain.embeddings import HuggingFaceEmbeddings
|
| 11 |
from langchain.chains import RetrievalQA
|
|
@@ -63,7 +63,8 @@ def extract_pdf_content(drive_url):
|
|
| 63 |
|
| 64 |
# Function to create a FAISS vector store from the document content
|
| 65 |
def create_vector_store(text):
|
| 66 |
-
|
|
|
|
| 67 |
|
| 68 |
# Use Hugging Face transformer model for embeddings
|
| 69 |
model_name = "sentence-transformers/all-MiniLM-L6-v2"
|
|
@@ -72,11 +73,14 @@ def create_vector_store(text):
|
|
| 72 |
|
| 73 |
def embed(sentence):
|
| 74 |
tokens = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
|
| 75 |
-
with torch.no_grad():
|
| 76 |
embeddings = model(**tokens).last_hidden_state.mean(dim=1).numpy()
|
| 77 |
return embeddings
|
| 78 |
|
|
|
|
| 79 |
embeddings = [embed(sentence)[0] for sentence in sentences]
|
|
|
|
|
|
|
| 80 |
vector_store = FAISS.from_embeddings(sentences, embeddings)
|
| 81 |
return vector_store, sentences
|
| 82 |
|
|
|
|
| 5 |
from PyPDF2 import PdfReader
|
| 6 |
from sentence_transformers import SentenceTransformer
|
| 7 |
from transformers import AutoTokenizer, AutoModel
|
| 8 |
+
import torch
|
| 9 |
from langchain.vectorstores import FAISS
|
| 10 |
from langchain.embeddings import HuggingFaceEmbeddings
|
| 11 |
from langchain.chains import RetrievalQA
|
|
|
|
| 63 |
|
| 64 |
# Function to create a FAISS vector store from the document content
|
| 65 |
def create_vector_store(text):
|
| 66 |
+
# Split the text into sentences on ". " and drop empty or whitespace-only pieces
|
| 67 |
+
sentences = [sentence.strip() for sentence in text.split(". ") if sentence.strip()]
|
| 68 |
|
| 69 |
# Use Hugging Face transformer model for embeddings
|
| 70 |
model_name = "sentence-transformers/all-MiniLM-L6-v2"
|
|
|
|
| 73 |
|
| 74 |
def embed(sentence):
|
| 75 |
tokens = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
|
| 76 |
+
with torch.no_grad():
|
| 77 |
embeddings = model(**tokens).last_hidden_state.mean(dim=1).numpy()
|
| 78 |
return embeddings
|
| 79 |
|
| 80 |
+
# Generate embeddings for cleaned sentences
|
| 81 |
embeddings = [embed(sentence)[0] for sentence in sentences]
|
| 82 |
+
|
| 83 |
+
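# Create a FAISS vector store from the sentences and their embeddings
# NOTE(review): LangChain's FAISS.from_embeddings appears to expect (text, vector) pairs plus an Embeddings object - confirm these arguments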
|
| 84 |
vector_store = FAISS.from_embeddings(sentences, embeddings)
|
| 85 |
return vector_store, sentences
|
| 86 |
|