Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| from sentence_transformers import SentenceTransformer | |
| import faiss | |
| import os | |
# Function to create a vector store
def fn_create_vector_store(lv_file_name, lv_domain):
    """
    Create a vector store by encoding and storing embeddings of column
    descriptions from an Excel file.

    Reads every sheet of the workbook, embeds the second column (the
    column descriptions) with a sentence-transformer model, stores the
    embeddings in a FAISS L2 index, and saves the per-row metadata
    (sheet name, column name, description) to a parquet file so that
    index hits can later be resolved back to source columns.

    Args:
        lv_file_name (str): The path to the Excel file.
        lv_domain (str): The domain name used to name the output files.

    Returns:
        str: A message indicating the status of the operation.
    """
    # Output file names (one FAISS index + one row-mapping parquet per domain)
    lv_faiss_file_name = 'db/' + lv_domain + '_index.faiss'
    lv_rowdata_file_name = 'db/' + lv_domain + '_row_mapping.parquet'
    # Skip the (expensive) embedding step if this domain was already indexed
    if os.path.exists(lv_faiss_file_name) and os.path.exists(lv_rowdata_file_name):
        return "Data Already Exist"
    # Ensure the output directory exists — neither faiss.write_index nor
    # DataFrame.to_parquet creates missing parent directories
    os.makedirs('db', exist_ok=True)
    # Load the pre-trained model
    # Details available here -> https://www.sbert.net/docs/pretrained_models.html
    lv_model = SentenceTransformer('all-MiniLM-L6-v2')
    # Read every sheet of the workbook into a dict of DataFrames
    lv_excel_data = pd.read_excel(lv_file_name, sheet_name=None)
    lv_embeddings_list = []
    lv_row_mapping = []
    for lv_sheet_name, lv_sheet_data in lv_excel_data.items():
        # Nothing to embed on an empty sheet; also keeps np.vstack safe below
        if lv_sheet_data.empty:
            continue
        # Normalise non-breaking spaces, which Excel exports frequently contain
        lv_sheet_data.iloc[:, 1] = lv_sheet_data.iloc[:, 1].apply(
            lambda x: str(x).replace(u'\xa0', u' '))
        lv_column_descriptions = lv_sheet_data.iloc[:, 1].astype(str).tolist()
        # FAISS requires float32 vectors
        lv_embeddings = lv_model.encode(lv_column_descriptions).astype('float32')
        lv_embeddings_list.append(lv_embeddings)
        # Record (sheet, column, description) for each embedded row, in the
        # same order as the embeddings, so index position i maps to row i
        for row in lv_sheet_data.itertuples(index=False):
            lv_row_mapping.append({
                'sheet_name': lv_sheet_name,
                'column_name': row[0],
                'column_description': row[1]
            })
    if not lv_embeddings_list:
        # Workbook contained no data rows at all — nothing to index
        # (previously this crashed inside np.vstack on an empty list)
        return "No Data Found"
    # Combine the per-sheet embeddings into a single (n, dim) array
    lv_merged_embeddings_list = np.vstack(lv_embeddings_list)
    # Build an exact L2 (Euclidean) index and add all vectors
    lv_dimension = lv_merged_embeddings_list.shape[1]
    lv_index = faiss.IndexFlatL2(lv_dimension)
    lv_index.add(lv_merged_embeddings_list)
    # Saving the FAISS index to a file
    faiss.write_index(lv_index, lv_faiss_file_name)
    # Persist the row metadata so searches can be resolved to sheet/column
    pd.DataFrame(lv_row_mapping).to_parquet(lv_rowdata_file_name, index=False)
    return "Record Added Successfully"
def fn_map_data(lv_saved_file_name, lv_file_name, lv_source_domain):
    """
    Map columns of a target Excel workbook to the most similar source
    columns using a previously built FAISS vector store.

    For each row of each sheet in the target workbook, the description
    (second column) is embedded and the single nearest source-column
    embedding is looked up in the domain's FAISS index. The resulting
    source->target column mapping is saved to an Excel file and also
    returned as JSON.

    Args:
        lv_saved_file_name (str): Path to the target Excel file to map.
        lv_file_name (str): Base name used for the output mapping file.
        lv_source_domain (str): Domain whose vector store is searched.

    Returns:
        str: The mapping as a JSON array of records with keys
            source_sheet_name, source_column, target_sheet_name,
            target_column.

    Raises:
        Exception: If the source domain's index or row-mapping file is
            missing (i.e. fn_create_vector_store was never run).
    """
    # Input/output file names derived from the source domain
    lv_faiss_file_name = 'db/' + lv_source_domain + '_index.faiss'
    lv_sourcedata_file_name = 'db/' + lv_source_domain + '_row_mapping.parquet'
    lv_mapping_file_name = 'db/' + lv_source_domain + "_" + lv_file_name
    # Guard clause: both artifacts of the vector store must exist
    if not (os.path.exists(lv_faiss_file_name) and os.path.exists(lv_sourcedata_file_name)):
        raise Exception("Source Domain Data Not Found")
    # Load the pre-trained model
    # Details available here -> https://www.sbert.net/docs/pretrained_models.html
    lv_model = SentenceTransformer('all-MiniLM-L6-v2')
    # Load the FAISS index and the row metadata built by fn_create_vector_store
    # (the original no-op `reindex()` call has been removed — it returned a
    # new DataFrame that was immediately discarded)
    lv_index = faiss.read_index(lv_faiss_file_name)
    lv_source_mapping_df = pd.read_parquet(lv_sourcedata_file_name)
    # Read every sheet of the target workbook
    lv_excel_data = pd.read_excel(lv_saved_file_name, sheet_name=None)
    # Collect mapped rows in a list and build the DataFrame once at the end;
    # pd.concat inside the loop is quadratic in the number of rows
    lv_mapped_rows = []
    for lv_sheet_name, lv_sheet_data in lv_excel_data.items():
        for row in lv_sheet_data.itertuples(index=False):
            try:
                # Embed the target column description; float32 to match the
                # dtype the index was built with
                lv_query_embedding = lv_model.encode([str(row[1])]).astype('float32')
                # Search for the single nearest source vector (k=1)
                _, lv_indices = lv_index.search(np.array(lv_query_embedding), 1)
                # Resolve the hit back to its source sheet/column
                lv_source_row = lv_source_mapping_df.iloc[lv_indices[0][0]]
                lv_mapped_rows.append({
                    'source_sheet_name': lv_source_row['sheet_name'],
                    'source_column': lv_source_row['column_name'],
                    'target_sheet_name': lv_sheet_name,
                    'target_column': row[0]
                })
            except Exception as e:
                # Best-effort per row: skip rows that cannot be embedded or
                # matched, but leave a trace instead of swallowing silently
                print(f"fn_map_data: skipping row in sheet '{lv_sheet_name}': {e}")
    lv_row_mapping_df = pd.DataFrame(
        lv_mapped_rows,
        columns=['source_sheet_name', 'source_column',
                 'target_sheet_name', 'target_column'])
    # Persist the mapping and return it as JSON records
    lv_row_mapping_df.to_excel(lv_mapping_file_name, index=False)
    return lv_row_mapping_df.to_json(orient='records')