Spaces:
Paused
Paused
| import os | |
| import sys | |
| import pandas as pd | |
| def display_and_store_directory_content(base_path): | |
| """ | |
| Display all paths with directories and files along with their content, | |
| and store the information in a Pandas DataFrame. | |
| Args: | |
| base_path (str): The root directory path to scan. | |
| Returns: | |
| None: Prints paths and content, and saves the DataFrame as a pickle file. | |
| """ | |
| data = [] # To store path and content as rows for the DataFrame | |
| for root, dirs, files in os.walk(base_path): | |
| # Store directories (no content) | |
| for d in dirs: | |
| dir_path = os.path.join(root, d) | |
| data.append({"path": dir_path, "content": ""}) | |
| print(f"Directory: {dir_path}") | |
| # Store files and their content | |
| for f in files: | |
| file_path = os.path.join(root, f) | |
| try: | |
| with open(file_path, 'r', encoding='utf-8') as file: | |
| content = file.read() | |
| except Exception as e: | |
| content = f"Error reading file: {e}" | |
| data.append({"path": file_path, "content": content}) | |
| print(f"\nFile: {file_path}") | |
| print("-" * 40) | |
| print(content) | |
| print("-" * 40) | |
| # Create a DataFrame | |
| df = pd.DataFrame(data) | |
| # Create the 'extraction' directory if it doesn't exist | |
| extraction_dir = "extraction" | |
| if not os.path.exists(extraction_dir): | |
| os.makedirs(extraction_dir) | |
| # Use the last component of the base path as the file name | |
| base_name = os.path.basename(os.path.normpath(base_path)) | |
| output_file = os.path.join(extraction_dir, f"{base_name}.pkl") | |
| # Save the DataFrame to a pickle file | |
| df.to_pickle(output_file) | |
| print(f"\nDataFrame saved to {output_file}") | |
| if __name__ == "__main__": | |
| # Ensure a directory path is provided as an argument | |
| if len(sys.argv) < 2: | |
| print("Usage: python utils\\extract_all_content.py <directory>") | |
| sys.exit(1) | |
| # Get the directory path from the command-line arguments | |
| directory_path = sys.argv[1] | |
| # Execute the function | |
| if os.path.exists(directory_path): | |
| display_and_store_directory_content(directory_path) | |
| else: | |
| print(f"Error: The path '{directory_path}' does not exist.") | |