Directly calculate embedding similarity
- app/app.py +35 -5
- process_data.py +43 -4
- requirements.txt +3 -1
app/app.py
CHANGED

@@ -16,7 +16,9 @@ from langchain_core.tools import tool
 from langchain_openai import ChatOpenAI
 from langchain_community.tools.arxiv.tool import ArxivQueryRun
 from langchain.schema.output_parser import StrOutputParser
-from sentence_transformers import SentenceTransformer
+from transformers import AutoModel, AutoTokenizer
+import torch
+import torch.nn.functional as F
 from langchain_core.vectorstores import VectorStore
 from langchain_core.documents import Document
 from langgraph.graph import StateGraph, END

@@ -105,9 +107,10 @@ def find_processed_data():
     """Find the processed_data directory path"""
     # Check common locations
     possible_paths = [
+        "/data/processed_data",
         "data/processed_data",
         "app/data/processed_data",
-        "/data/processed_data"
+        "/app/data/processed_data"
     ]
 
     for path in possible_paths:

@@ -122,6 +125,32 @@ def find_processed_data():
 
     raise FileNotFoundError("Could not find processed_data directory")
 
+class ArcticEmbedder:
+    def __init__(self, model_name):
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModel.from_pretrained(model_name)
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model.to(self.device)
+
+    def _mean_pooling(self, model_output, attention_mask):
+        token_embeddings = model_output.last_hidden_state
+        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+    def encode(self, query):
+        encoded_input = self.tokenizer(
+            [query],
+            padding=True,
+            truncation=True,
+            return_tensors="pt"
+        ).to(self.device)
+
+        with torch.no_grad():
+            model_output = self.model(**encoded_input)
+
+        embeddings = self._mean_pooling(model_output, encoded_input['attention_mask'])
+        return F.normalize(embeddings, p=2, dim=1).cpu().numpy().flatten().tolist()
+
 # Initialize the vectorstore
 @st.cache_resource
 def initialize_vectorstore():

@@ -146,11 +175,12 @@ def initialize_vectorstore():
     except Exception as e:
         embedded_docs = []
         raise RuntimeError(f"Error loading embedded_docs.pkl: {str(e)}")
-
-    # Initialize embedding model
+
+    # Initialize custom embedding model
     model_name = "kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec"
    try:
-        embedding_model = SentenceTransformer(model_name)
+        embedding_model = ArcticEmbedder(model_name)
+
     except Exception as e:
         print(f"Error loading model: {str(e)}")
         raise RuntimeError(f"Error initializing SentenceTransformer model: {str(e)}")
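A minimal usage sketch (not part of the commit) of how the ArcticEmbedder introduced above could be used to "directly calculate embedding similarity", as the commit title says: because encode() returns an L2-normalized vector, a dot product against stored document vectors is already the cosine similarity. The rank_documents helper and the (document, vector) layout of embedded_docs are illustrative assumptions; only ArcticEmbedder itself comes from the diff.

# Illustrative sketch only: rank_documents and the (doc, vector) layout of
# embedded_docs are assumptions, not code from this commit.
import numpy as np

def rank_documents(embedder, embedded_docs, query, top_k=3):
    # encode() returns an L2-normalized list of floats, so dot product == cosine similarity,
    # assuming the stored vectors were also produced (and normalized) by ArcticEmbedder.
    query_vec = np.array(embedder.encode(query))
    doc_matrix = np.array([vec for _, vec in embedded_docs])
    scores = doc_matrix @ query_vec
    top_idx = np.argsort(scores)[::-1][:top_k]
    return [(embedded_docs[i][0], float(scores[i])) for i in top_idx]

# Example call (assuming embedded_docs was loaded from embedded_docs.pkl):
# embedder = ArcticEmbedder("kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec")
# top_docs = rank_documents(embedder, embedded_docs, "How should I size an A/B test?")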
process_data.py
CHANGED

@@ -12,7 +12,9 @@ from langchain_community.document_loaders import DirectoryLoader
 from langchain_community.document_loaders import PyPDFLoader
 from langchain_core.documents import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from sentence_transformers import SentenceTransformer
+from transformers import AutoModel, AutoTokenizer
+import torch
+import torch.nn.functional as F
 from langchain_community.vectorstores import Qdrant
 from qdrant_client import QdrantClient
 from qdrant_client.models import Distance, VectorParams

@@ -58,6 +60,40 @@ def clean_directory(directory_path):
     path.mkdir(parents=True, exist_ok=True)
     print(f"Created clean directory: {directory_path}")
 
+class ArcticEmbedder:
+    def __init__(self, model_name):
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModel.from_pretrained(model_name)
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model.to(self.device)
+
+    def _mean_pooling(self, model_output, attention_mask):
+        token_embeddings = model_output.last_hidden_state
+        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+    def encode(self, texts, batch_size=32):
+        all_embeddings = []
+        for i in range(0, len(texts), batch_size):
+            batch = texts[i:i+batch_size]
+
+            encoded_input = self.tokenizer(
+                batch,
+                padding=True,
+                truncation=True,
+                return_tensors="pt"
+            ).to(self.device)
+
+            with torch.no_grad():
+                model_output = self.model(**encoded_input)
+
+            batch_embeddings = self._mean_pooling(model_output, encoded_input['attention_mask'])
+            batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1)
+
+            all_embeddings.append(batch_embeddings.cpu().numpy())
+
+        return np.concatenate(all_embeddings)
+
 def process_pdfs():
     """Process PDFs and create vectorstore"""
     print("Processing PDFs...")

@@ -150,10 +186,13 @@ def process_pdfs():
     with open(processed_data_dir / "chunks.pkl", "wb") as f:
         pickle.dump(split_chunks, f)
 
-
+
+
+    # Initialize custom embedding model
     try:
-        embedding_model = SentenceTransformer("kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec")
-        print("Successfully loaded SentenceTransformer model")
+        embedding_model = ArcticEmbedder("kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec")
+        print("Successfully loaded ArcticEmbedder model")
+
     except Exception as e:
         print(f"Error loading model: {str(e)}")
         raise RuntimeError(f"Error initializing SentenceTransformer model: {str(e)}")
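For context, a rough sketch (under stated assumptions, not code from this commit) of how the batched encode() added to process_data.py might be wired into the chunk-embedding step. split_chunks and processed_data_dir appear in the surrounding script; the embedded_docs.pkl file name is inferred from the loader in app/app.py, so treat the exact wiring as an assumption.

# Sketch under assumptions: embed every chunk in batches and persist
# (chunk, vector) pairs so app/app.py can load them from embedded_docs.pkl.
import pickle

embedder = ArcticEmbedder("kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec")

texts = [chunk.page_content for chunk in split_chunks]   # LangChain Document objects from the splitter
vectors = embedder.encode(texts, batch_size=32)          # ndarray of shape (len(texts), hidden_size)

embedded_docs = list(zip(split_chunks, vectors.tolist()))
with open(processed_data_dir / "embedded_docs.pkl", "wb") as f:
    pickle.dump(embedded_docs, f)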
requirements.txt
CHANGED

@@ -10,4 +10,6 @@ tiktoken>=0.6.0
 python-dotenv>=1.0.1
 qdrant-client>=1.7.0
 scipy>=1.10.0
-sentence-transformers==2.3.0
+sentence-transformers==2.3.0
+transformers>=4.51.3
+torch>=2.0.1