NHZ committed on
Commit
b8b3983
·
verified ·
1 Parent(s): 3fd2783

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -9
app.py CHANGED
@@ -64,7 +64,7 @@ def extract_pdf_content(drive_url):
64
  def create_vector_store(text):
65
  # Split the text into sentences and clean it
66
  sentences = [sentence.strip() for sentence in text.split(". ") if sentence.strip()]
67
-
68
  # Use Hugging Face transformer model for embeddings
69
  model_name = "sentence-transformers/all-MiniLM-L6-v2"
70
  tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -73,17 +73,14 @@ def create_vector_store(text):
73
  def embed(sentence):
74
  tokens = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
75
  with torch.no_grad():
76
- embeddings = model(**tokens).last_hidden_state.mean(dim=1).numpy()
77
  return embeddings
78
 
79
- # Generate embeddings for cleaned sentences
80
- embeddings = [embed(sentence)[0] for sentence in sentences]
81
-
82
- # Convert embeddings to a numpy array
83
- embeddings = np.array(embeddings, dtype=np.float32)
84
-
85
  # Create a FAISS vector store with sentences and their embeddings
86
- vector_store = FAISS.from_embeddings(embeddings=embeddings, texts=sentences)
 
 
 
87
  return vector_store, sentences
88
 
89
  # Streamlit app
 
64
  def create_vector_store(text):
65
  # Split the text into sentences and clean it
66
  sentences = [sentence.strip() for sentence in text.split(". ") if sentence.strip()]
67
+
68
  # Use Hugging Face transformer model for embeddings
69
  model_name = "sentence-transformers/all-MiniLM-L6-v2"
70
  tokenizer = AutoTokenizer.from_pretrained(model_name)
 
73
  def embed(sentence):
74
  tokens = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
75
  with torch.no_grad():
76
+ embeddings = model(**tokens).last_hidden_state.mean(dim=1).squeeze().numpy()
77
  return embeddings
78
 
 
 
 
 
 
 
79
  # Create a FAISS vector store with sentences and their embeddings
80
+ vector_store = FAISS.from_texts(
81
+ texts=sentences,
82
+ embedding_function=embed
83
+ )
84
  return vector_store, sentences
85
 
86
  # Streamlit app