Spaces:
Sleeping
Sleeping
title embed code
Browse files
src/title_embd/__pycache__/embed.cpython-311.pyc
ADDED
|
Binary file (1.05 kB). View file
|
|
|
src/title_embd/__pycache__/index.cpython-311.pyc
ADDED
|
Binary file (901 Bytes). View file
|
|
|
src/title_embd/__pycache__/preprocessing.cpython-311.pyc
ADDED
|
Binary file (978 Bytes). View file
|
|
|
src/title_embd/embed.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sentence_transformers import SentenceTransformer
|
| 2 |
+
from src.configs.config import EMBEDDING_MODEL, TITLE_EMBEDDINGS_FILE
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
def generate_title_embeddings(title_data):
    """Encode every title in *title_data* and persist the resulting embeddings.

    Args:
        title_data: Iterable of mappings; each one must provide a ``"title"``
            key whose value is passed to the sentence-transformer model.

    Returns:
        Whatever ``SentenceTransformer.encode`` yields for the titles when
        called with ``convert_to_tensor=False``. The same object is written
        to ``TITLE_EMBEDDINGS_FILE`` with ``np.save`` before being returned.
    """
    # The model is instantiated on every call, before the titles are read.
    encoder = SentenceTransformer(EMBEDDING_MODEL)

    title_texts = []
    for entry in title_data:
        title_texts.append(entry["title"])

    vectors = encoder.encode(title_texts, convert_to_tensor=False)

    # NOTE(review): np.save appends ".npy" when the target name lacks it;
    # confirm TITLE_EMBEDDINGS_FILE already carries that suffix.
    np.save(TITLE_EMBEDDINGS_FILE, vectors)
    return vectors
|
src/title_embd/index.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import faiss
|
| 2 |
+
import numpy as np
|
| 3 |
+
from src.configs.config import TITLE_EMBEDDINGS_FILE, TITLE_FAISS_INDEX_FILE
|
| 4 |
+
|
| 5 |
+
def create_title_faiss_index():
    """Build a flat L2 FAISS index over the saved title embeddings.

    Loads the embedding matrix from ``TITLE_EMBEDDINGS_FILE``, adds every row
    to a ``faiss.IndexFlatL2`` whose dimensionality is taken from the
    matrix's second axis, writes the index to ``TITLE_FAISS_INDEX_FILE`` and
    returns it.

    Returns:
        The populated ``faiss.IndexFlatL2`` instance.

    Raises:
        IndexError: If the loaded array has fewer than two dimensions.
    """
    # FAISS documents its input as a C-contiguous float32 matrix. Coercing
    # here is a no-op for arrays already in that form and avoids a type
    # error on faiss builds that do not convert implicitly.
    embeddings = np.ascontiguousarray(
        np.load(TITLE_EMBEDDINGS_FILE), dtype=np.float32
    )
    dimension = embeddings.shape[1]

    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    # str() because the target may be a path object rather than a plain string.
    faiss.write_index(index, str(TITLE_FAISS_INDEX_FILE))
    return index
|
src/title_embd/preprocessing.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from src.configs.config import METADATA_FILE
|
| 3 |
+
|
| 4 |
+
def preprocess_titles():
    """Load the metadata CSV and return one title/link record per row.

    Reads ``METADATA_FILE`` with pandas and pairs the values of the
    ``"Nom du document"`` column with those of the ``"Lien"`` column, in
    row order.

    Returns:
        A list of dicts shaped ``{"title": ..., "link": ...}``.
    """
    frame = pd.read_csv(METADATA_FILE)
    document_names = frame["Nom du document"].tolist()
    document_links = frame["Lien"].tolist()

    records = []
    for name, url in zip(document_names, document_links):
        records.append({"title": name, "link": url})
    return records
|