ELHACHYMI committed on
Commit
f2aadac
·
verified ·
1 Parent(s): 508def0

title embed code

Browse files
src/title_embd/__pycache__/embed.cpython-311.pyc ADDED
Binary file (1.05 kB). View file
 
src/title_embd/__pycache__/index.cpython-311.pyc ADDED
Binary file (901 Bytes). View file
 
src/title_embd/__pycache__/preprocessing.cpython-311.pyc ADDED
Binary file (978 Bytes). View file
 
src/title_embd/embed.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer
2
+ from src.configs.config import EMBEDDING_MODEL, TITLE_EMBEDDINGS_FILE
3
+ import numpy as np
4
+
5
def generate_title_embeddings(title_data):
    """Encode the titles in *title_data* and save the vectors to disk.

    Args:
        title_data: Iterable of mappings; the value under the ``"title"``
            key of each entry is what gets encoded.

    Returns:
        The embeddings produced by ``SentenceTransformer.encode`` (requested
        with ``convert_to_tensor=False``), in the same order as
        *title_data*. The same object is also written to
        ``TITLE_EMBEDDINGS_FILE`` via ``np.save``.
    """
    # The model is instantiated first, before the input is inspected.
    encoder = SentenceTransformer(EMBEDDING_MODEL)

    title_texts = []
    for entry in title_data:
        title_texts.append(entry["title"])

    vectors = encoder.encode(title_texts, convert_to_tensor=False)
    np.save(TITLE_EMBEDDINGS_FILE, vectors)
    return vectors
src/title_embd/index.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faiss
2
+ import numpy as np
3
+ from src.configs.config import TITLE_EMBEDDINGS_FILE, TITLE_FAISS_INDEX_FILE
4
+
5
def create_title_faiss_index():
    """Build a flat L2 FAISS index over the saved title embeddings.

    Loads the array stored at ``TITLE_EMBEDDINGS_FILE``, adds every row to a
    ``faiss.IndexFlatL2`` whose dimensionality is the length of the array's
    second axis, writes the index to ``TITLE_FAISS_INDEX_FILE`` and returns
    it.

    Returns:
        The populated ``faiss.IndexFlatL2`` instance.
    """
    # FAISS operates on C-contiguous float32 matrices. Coerce here so the
    # index can be built whatever dtype/layout the saved array has; no copy
    # is made when the array already satisfies both requirements.
    embeddings = np.ascontiguousarray(
        np.load(TITLE_EMBEDDINGS_FILE), dtype=np.float32
    )

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    # write_index is given a plain string path
    # (presumably the constant may be a pathlib.Path — verify in config).
    faiss.write_index(index, str(TITLE_FAISS_INDEX_FILE))
    return index
src/title_embd/preprocessing.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from src.configs.config import METADATA_FILE
3
+
4
def preprocess_titles():
    """Read the metadata CSV and pair each document title with its link.

    Reads ``METADATA_FILE`` with pandas and takes the ``"Nom du document"``
    and ``"Lien"`` columns.

    Returns:
        A list with one dict per CSV row, in file order, of the form
        ``{"title": <Nom du document>, "link": <Lien>}``.
    """
    frame = pd.read_csv(METADATA_FILE)
    doc_names = frame["Nom du document"].tolist()
    doc_links = frame["Lien"].tolist()

    records = []
    for doc_name, doc_link in zip(doc_names, doc_links):
        records.append({"title": doc_name, "link": doc_link})
    return records