Delete selectBioModels.py
Browse files- selectBioModels.py +0 -81
selectBioModels.py
DELETED
|
@@ -1,81 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import re
|
| 3 |
-
import pandas as pd
|
| 4 |
-
import shutil
|
| 5 |
-
|
| 6 |
-
# Function to search BioModels and create the CSV file
|
| 7 |
-
def search_biomodels(directory, keywords, output_file):
    """Scan the files in ``directory`` for matching BioModels and save them as CSV.

    For every regular file directly inside ``directory`` the text is searched
    for:

    * BioModels identifiers, i.e. the word following ``biomodels.db/``;
    * a model name, i.e. the quoted text in ``<first keyword> is "<name>"``
      (case-insensitive).

    When a name is found and it contains any of ``keywords``
    (case-insensitive), every identifier found in that file is recorded
    together with that name.

    Args:
        directory: Folder whose immediate entries are scanned.
        keywords: Sequence of search terms; the first one is also used to
            locate the model name inside each file.
        output_file: Path of the CSV file to write. It has the columns
            ``Biomodel Number`` and ``Biomodel Name``.

    Unreadable files are reported on stdout and skipped. An empty
    ``keywords`` sequence cannot match anything and yields a CSV that
    contains only the header row.
    """
    biomodel_numbers_list = []
    matching_biomodels = []

    # Sorted so the row order of the CSV does not depend on the file system.
    files = sorted(os.listdir(directory))

    if keywords:
        # Built once: neither pattern changes from file to file.
        number_pattern = re.compile(r'biomodels\.db/(\w+)')
        name_pattern = re.compile(
            rf'{re.escape(keywords[0])} is "([^"]+)"', re.IGNORECASE
        )
        lowered_keywords = [keyword.lower() for keyword in keywords]

        for file in files:
            file_path = os.path.join(directory, file)

            # os.listdir also returns sub-directories, which cannot be read.
            if not os.path.isfile(file_path):
                continue

            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    file_content = f.read()
            except OSError as e:
                print(f"Error processing file {file_path}: {e}")
                continue

            biomodel_name_match = name_pattern.search(file_content)
            if biomodel_name_match is None:
                continue

            biomodel_name = biomodel_name_match.group(1)
            lowered_name = biomodel_name.lower()
            if not any(keyword in lowered_name for keyword in lowered_keywords):
                continue

            # Every identifier in the file is paired with the file's name, so
            # both lists always have the same length.
            biomodel_numbers = number_pattern.findall(file_content)
            biomodel_numbers_list.extend(biomodel_numbers)
            matching_biomodels.extend([biomodel_name] * len(biomodel_numbers))

    # Create a DataFrame from the collected data
    df = pd.DataFrame({
        'Biomodel Number': biomodel_numbers_list,
        'Biomodel Name': matching_biomodels,
    })

    # Save the DataFrame to a CSV file
    df.to_csv(output_file, index=False)
    print(f"Data saved to {output_file}")
|
| 48 |
-
|
| 49 |
-
# Function to copy matching files to final_models directory
|
| 50 |
-
def copy_matching_files(csv_file, data_folder, final_models_folder):
    """Copy every file under ``data_folder`` that mentions a model listed in ``csv_file``.

    A file matches when its text contains one of the values of the
    ``Biomodel Number`` column (case-sensitive) or one of the values of the
    ``Biomodel Name`` column (case-insensitive). Matching files are copied
    flat into ``final_models_folder``; each file is copied and reported once,
    no matter how many CSV rows it matches.

    Args:
        csv_file: CSV with the columns ``Biomodel Number`` and
            ``Biomodel Name``.
        data_folder: Root folder that is walked recursively.
        final_models_folder: Destination folder; created if missing.
    """
    # Create the final_models folder if it doesn't exist
    os.makedirs(final_models_folder, exist_ok=True)

    # Read every cell as text: with the default type inference a purely
    # numeric identifier becomes an int and a blank cell becomes NaN, and
    # both break the substring tests below.
    df = pd.read_csv(csv_file, dtype=str, keep_default_na=False)

    # Blank cells are dropped because '' is a substring of every text.
    biomodel_numbers = {number for number in df['Biomodel Number'] if number}
    biomodel_names = {name.lower() for name in df['Biomodel Name'] if name}

    destination = os.path.abspath(final_models_folder)

    # Iterate through the data folder to find and copy matching files
    for root, dirs, files in os.walk(data_folder):
        # Do not descend into the destination when it lives inside the data
        # folder; copying a file onto itself raises shutil.SameFileError.
        dirs[:] = [
            name for name in dirs
            if os.path.abspath(os.path.join(root, name)) != destination
        ]

        for file in files:
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()

            # Lower-case the text once per file instead of once per CSV row.
            lowered_content = content.lower()
            is_match = (
                any(name in lowered_content for name in biomodel_names)
                or any(number in content for number in biomodel_numbers)
            )
            if is_match:
                shutil.copy(file_path, final_models_folder)
                print(f"Copied: {file} to final_models")

    print(f"All matching biomodel files have been copied to {final_models_folder}")
|
| 72 |
-
|
| 73 |
-
# Main execution
|
| 74 |
-
# Default locations used when this file is run as a script.
directory = r'C:\Users\navan\Downloads\BioModelsRAG\BioModelsRAG\data'
output_file = r'C:\Users\navan\Downloads\BioModelsRAG\biomodels_output.csv'
final_models_folder = r'C:\Users\navan\Downloads\BioModelsRAG\final_models'

# Only prompt and touch the file system when executed directly, so that
# importing this module to reuse its functions has no side effects.
if __name__ == "__main__":
    # Whitespace-separated search terms typed by the user
    user_keywords = input("Keyword you would like to search for: ").split()

    # Search and copy files
    search_biomodels(directory, user_keywords, output_file)
    copy_matching_files(output_file, directory, final_models_folder)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|