|
|
import os
|
|
|
import pandas as pd
|
|
|
from sentence_transformers import SentenceTransformer
|
|
|
import numpy as np
|
|
|
from tqdm import tqdm
|
|
|
import csv
|
|
|
import faiss
|
|
|
import requests
|
|
|
|
|
|
# Redirect the HuggingFace model cache to /tmp — useful when the default
# home-directory cache is not writable (e.g. serverless/read-only deployments).
os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'

# Remote locations of the pre-built dataset artefacts on the HuggingFace Hub:
# the raw CSV data, a serialized FAISS index, and the embedding vectors.
base_url = "https://huggingface.co/datasets/manhteky123/LawVietnamese/resolve/main/"
data_url = f"{base_url}data.csv"
faiss_index_url = f"{base_url}faiss_index.bin"
vectors_url = f"{base_url}vectors.npy"
|
|
|
|
|
|
|
|
|
def download_to_disk(url, filename):
    """Download *url* and write the response body to *filename*.

    Args:
        url: Fully-qualified URL to fetch.
        filename: Local path the response body is written to (binary mode).

    Raises:
        Exception: If the server responds with a non-200 status code.
    """
    # Bound the request so a stalled server cannot hang the script forever.
    response = requests.get(url, timeout=60)
    if response.status_code == 200:
        with open(filename, 'wb') as f:
            f.write(response.content)
        # Bug fix: the original message printed the literal text "(unknown)"
        # instead of the destination filename.
        print(f"Downloaded {url} to {filename}.")
    else:
        raise Exception(f"Failed to download {url}: {response.status_code}")
|
|
|
|
|
|
|
|
|
# Local destinations for the downloaded artefacts.
data_file_path = 'data.csv'
faiss_index_file_path = 'faiss_index.bin'
vectors_file_path = 'vectors.npy'

# Fetch every artefact from the Hub in the same fixed order: CSV data,
# FAISS index, then the embedding vectors.
for _url, _path in (
    (data_url, data_file_path),
    (faiss_index_url, faiss_index_file_path),
    (vectors_url, vectors_file_path),
):
    download_to_disk(_url, _path)
|
|
|
|
|
|
|
|
|
# Parse the downloaded CSV into a three-column DataFrame.
with open(data_file_path, 'r', encoding='utf-8') as file:
    reader = csv.reader(file)
    next(reader)  # discard the header row
    data = []
    for row in reader:
        # Each parsed row's first field is re-split on its first two commas
        # into (so_hieu, dieu, truncated_text).
        # NOTE(review): this assumes the whole record lands in row[0] — i.e.
        # the file is not a conventionally quoted CSV; confirm against the
        # actual data.csv format.
        data.append(row[0].split(',', 2))

df = pd.DataFrame(data, columns=['so_hieu', 'dieu', 'truncated_text'])

print(df.head())
|
|
|
|
|
|
|
|
|
# Name of the DataFrame column whose text is returned by retrieve_documents().
# (The original assigned this twice with the same value; once suffices.)
column_name = 'truncated_text'
|
|
|
|
|
|
|
|
|
# Multilingual embedding model used to encode incoming queries; it must be the
# same model that produced the pre-computed index/vectors downloaded above.
model = SentenceTransformer('intfloat/multilingual-e5-small')

# Pre-built FAISS index over the document embeddings.
index = faiss.read_index(faiss_index_file_path)

# Raw embedding matrix — loaded here but not referenced in the visible code;
# presumably kept for downstream use. TODO(review): confirm it is needed.
vectors = np.load(vectors_file_path)
|
|
|
|
|
|
def retrieve_documents(query, k=5, threshold=0.7):
    """Return up to *k* stored document texts most similar to *query*.

    The query is embedded with the module-level sentence-transformer model
    and searched against the FAISS index. Each returned distance is mapped
    to a similarity score via 1 / (1 + distance), and hits whose score falls
    below *threshold* are dropped.

    Args:
        query: Free-text search string.
        k: Number of nearest neighbours to request from the index.
        threshold: Minimum similarity in (0, 1] a hit must reach to be kept.

    Returns:
        List of texts from df[column_name] that passed the threshold,
        in nearest-first order (possibly empty).
    """
    # encode() yields a tensor; FAISS needs a CPU numpy array.
    query_vector = model.encode([query], convert_to_tensor=True).cpu().numpy()
    distances, indices = index.search(query_vector, k)
    # NOTE(review): the 1/(1+d) mapping assumes the index metric is a
    # distance (e.g. L2), not an inner product — confirm how it was built.
    return [
        df.iloc[idx][column_name]
        for dist, idx in zip(distances[0], indices[0])
        if 1 / (1 + dist) >= threshold
    ]
|
|
|
|