# AI-Searching-System / utils / preprocessing.py
# thienphuc12339's picture
# Upload 12 files
# dddf3f6 verified
import os
from source.text import save_text_file
from source.embedding import text_to_embedding
import numpy as np
def generate_unique_id(directory="raw_text"):
    """Return the next free sequential ID for files stored in *directory*.

    The candidate ID starts at ``number_of_files + 1``. If a file whose name
    (without extension) already equals that candidate exists -- which happens
    once files have been deleted or the numbering has gaps -- the candidate is
    incremented until it is unused, so an existing file is never reused as
    the "next" ID.

    The directory is created if it does not exist yet.

    Args:
        directory: Directory whose regular files are counted. Defaults to
            ``"raw_text"``.

    Returns:
        The ID as a zero-padded string of at least two digits
        (``"01"``, ``"02"``, ..., ``"99"``, ``"100"``, ...).
    """
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test.
    os.makedirs(directory, exist_ok=True)

    # Only regular files count; sub-directories are ignored.
    files = [
        name
        for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    ]
    taken_stems = {os.path.splitext(name)[0] for name in files}

    next_id = len(files) + 1
    formatted_id = f"{next_id:02d}"
    # A plain count collides with an existing file when numbering has gaps;
    # skip forward until the ID is genuinely unused.
    while formatted_id in taken_stems:
        next_id += 1
        formatted_id = f"{next_id:02d}"
    return formatted_id
def process_mentor_profile(profile_text):
    """Store a mentor profile as raw text plus its embedding and return its ID.

    The profile gets a sequential ID from ``generate_unique_id``. The text is
    handed to ``save_text_file`` as ``<id>.txt`` with the ``raw_text``
    directory, and the result of ``text_to_embedding(profile_text)`` is
    written with ``numpy.save`` to ``embedded_text/<id>.npy``.

    Args:
        profile_text: The profile text to store and embed.

    Returns:
        The ID string shared by the ``.txt`` and ``.npy`` files.
    """
    raw_text_directory = "raw_text"
    embedding_directory = "embedded_text"

    # exist_ok=True replaces the race-prone "check exists, then create" pair.
    os.makedirs(raw_text_directory, exist_ok=True)
    os.makedirs(embedding_directory, exist_ok=True)

    # Derive the ID from the same directory the text file is written to, so
    # the two cannot drift apart.
    profile_id = generate_unique_id(raw_text_directory)

    # Save profile text to a .txt file named after the ID.
    txt_filename = f"{profile_id}.txt"
    save_text_file(txt_filename, profile_text, directory=raw_text_directory)

    # Embed the text and store the result under the same ID.
    embedding = text_to_embedding(profile_text)
    npy_filename = f"{profile_id}.npy"
    np.save(os.path.join(embedding_directory, npy_filename), embedding)

    return profile_id