findthehead committed on
Commit
5a3850e
·
1 Parent(s): c88d20c

Use Mistral embeddings to match original index

Browse files
Files changed (2) hide show
  1. app.py +23 -25
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  import gradio as gr
 
3
  from huggingface_hub import InferenceClient
4
  from langchain_core.embeddings import Embeddings
5
  from langchain_pinecone import PineconeVectorStore
@@ -13,36 +14,33 @@ from pinecone import Pinecone
13
  DEFAULT_MODEL = "Qwen/Qwen2.5-72B-Instruct"
14
 
15
 
16
- class HFEmbeddings(Embeddings):
17
- """Custom embeddings class using HuggingFace Inference API"""
18
 
19
- # Using BGE-large which produces 1024-dimensional embeddings to match Pinecone index
20
- def __init__(self, model_name: str = "BAAI/bge-large-en-v1.5"):
21
- self.model_name = model_name
22
- self.client = InferenceClient(token=os.getenv("HF_TOKEN"))
23
 
24
- def _to_float_list(self, result) -> list[float]:
25
- """Convert numpy arrays or nested lists to plain Python floats"""
26
- import numpy as np
27
- if hasattr(result, 'tolist'):
28
- return result.tolist()
29
- if isinstance(result, (list, tuple)):
30
- # Handle nested structure - flatten if needed
31
- if len(result) > 0 and isinstance(result[0], (list, tuple, np.ndarray)):
32
- result = result[0]
33
- return [float(x) for x in result]
34
- return [float(x) for x in result]
 
 
35
 
36
  def embed_documents(self, texts: list[str]) -> list[list[float]]:
37
- embeddings = []
38
- for text in texts:
39
- result = self.client.feature_extraction(text, model=self.model_name)
40
- embeddings.append(self._to_float_list(result))
41
- return embeddings
42
 
43
  def embed_query(self, text: str) -> list[float]:
44
- result = self.client.feature_extraction(text, model=self.model_name)
45
- return self._to_float_list(result)
46
 
47
 
48
  class ResearchParrot:
@@ -54,7 +52,7 @@ class ResearchParrot:
54
 
55
  def embeddings(self):
56
  if self._embeddings is None:
57
- self._embeddings = HFEmbeddings()
58
  return self._embeddings
59
 
60
  def vectorstore(self):
 
1
  import os
2
  import gradio as gr
3
+ import requests
4
  from huggingface_hub import InferenceClient
5
  from langchain_core.embeddings import Embeddings
6
  from langchain_pinecone import PineconeVectorStore
 
14
  DEFAULT_MODEL = "Qwen/Qwen2.5-72B-Instruct"
15
 
16
 
17
class MistralEmbeddings(Embeddings):
    """Embeddings backed by the Mistral embeddings HTTP endpoint.

    Uses the ``mistral-embed`` model so that vectors match the ones in the
    original index. The API key is read from the ``MISTRAL_API_KEY``
    environment variable when the instance is created.
    """

    # Seconds to wait for the HTTP call. ``requests`` applies no timeout by
    # default, so without this a stalled connection would block forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.api_key = os.getenv("MISTRAL_API_KEY")
        self.model = "mistral-embed"
        self.url = "https://api.mistral.ai/v1/embeddings"

    def _get_embeddings(self, texts: list[str]) -> list[list[float]]:
        """Request one embedding per entry of ``texts`` in a single call.

        Args:
            texts: Strings to embed.

        Returns:
            The ``embedding`` field of each item in the response's ``data``
            list, in the order the response lists them. An empty ``texts``
            yields an empty list without contacting the API.

        Raises:
            ValueError: If no API key was found in the environment.
            requests.HTTPError: If the API answers with an error status.
        """
        # Nothing to embed: skip the network round trip entirely.
        if not texts:
            return []

        # Fail with a clear message instead of sending "Bearer None" and
        # getting back an opaque authentication error.
        if not self.api_key:
            raise ValueError("MISTRAL_API_KEY environment variable is not set")

        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        payload = {
            "model": self.model,
            "input": texts,
        }
        response = requests.post(
            self.url,
            headers=headers,
            json=payload,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        result = response.json()
        return [item["embedding"] for item in result["data"]]

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Return one embedding vector per document in ``texts``."""
        return self._get_embeddings(texts)

    def embed_query(self, text: str) -> list[float]:
        """Return the embedding vector for a single query string."""
        return self._get_embeddings([text])[0]
 
44
 
45
 
46
  class ResearchParrot:
 
52
 
53
  def embeddings(self):
54
  if self._embeddings is None:
55
+ self._embeddings = MistralEmbeddings()
56
  return self._embeddings
57
 
58
  def vectorstore(self):
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
  gradio>=4.0.0
2
  huggingface_hub>=0.20.0
3
  langchain>=0.1.0
4
- langchain-community>=0.2.0
5
  langchain-pinecone>=0.1.0
6
  pinecone-client>=3.0.0
 
 
1
  gradio>=4.0.0
2
  huggingface_hub>=0.20.0
3
  langchain>=0.1.0
4
+ langchain-core>=0.1.0
5
  langchain-pinecone>=0.1.0
6
  pinecone-client>=3.0.0
7
+ requests>=2.28.0