Spaces:
Runtime error
Runtime error
# TODO: this module needs to be cleaned up.
| import json | |
| import os | |
| import subprocess | |
| import PyPDF2 | |
| import csv | |
| import fitz # PyMuPDF | |
def extract_text_from_pdf(pdf_path):
    """
    Extracts all text from a PDF file using PyMuPDF (fitz).

    :param pdf_path: Path to the PDF file.
    :return: The text of every page, concatenated in page order, as a string.
    """
    doc = fitz.open(pdf_path)
    try:
        # Gather the text page by page and join once, instead of growing a
        # string with += on every page.
        return ''.join(
            doc.load_page(page_num).get_text()
            for page_num in range(len(doc))
        )
    finally:
        # Close the document even when loading or extracting a page fails,
        # so the underlying file handle is never leaked.
        doc.close()
# Single-character substitutions applied to the extracted PDF text:
# German umlauts are transliterated (upper case maps to the same lower-case
# digraph), the characters , ; \ " become "_", and every newline becomes the
# literal marker "<newline>".
_PDF_TEXT_REPLACEMENTS = str.maketrans({
    "ä": "ae", "Ä": "ae",
    "ö": "oe", "Ö": "oe",
    "ü": "ue", "Ü": "ue",
    ",": "_", ";": "_", "\\": "_", '"': "_",
    "\n": "<newline>",
})


def read_pdfs_from_folder(folder_path):
    """
    Reads all PDF files in the specified folder using PdfReader and extracts their text.

    Only entries whose name ends with '.pdf' (case-sensitive) are read. The
    extracted text of each file is sanitised before it is stored: umlauts are
    transliterated to ae/oe/ue, the characters , ; \\ " are replaced by "_",
    and newlines are replaced by the marker "<newline>".

    Parameters:
    - folder_path: The path to the folder containing PDF files.

    Returns:
    - A dictionary with file names as keys and their sanitised text as values.
    """
    pdf_texts = {}
    for filename in os.listdir(folder_path):
        if not filename.endswith('.pdf'):
            continue
        file_path = os.path.join(folder_path, filename)
        page_texts = []
        with open(file_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            for page in pdf_reader.pages:
                try:
                    page_texts.append(page.extract_text())
                except UnicodeDecodeError as e:
                    # Best effort: report the page that failed to decode and
                    # keep the text of the remaining pages.
                    print(e)
        # One translate() pass replaces every special character; the previous
        # per-character index()/slice rebuild was quadratic in the text length.
        pdf_texts[filename] = ''.join(page_texts).translate(_PDF_TEXT_REPLACEMENTS)
    return pdf_texts
def read_csv_lines_as_strings(filename):
    """
    Parses a CSV file and turns every parsed row back into one plain string.

    Parameters:
    - filename: The path to the CSV file.

    Returns:
    - A list with one string per parsed row, the row's fields joined by
      commas (quoting present in the file is not reproduced). If a
      UnicodeDecodeError occurs while reading, it is printed and the rows
      collected up to that point are returned.
    """
    collected = []
    with open(filename, newline='') as handle:
        try:
            # Explicit loop on purpose: rows appended before a decode error
            # must survive, so the list is filled incrementally.
            for fields in csv.reader(handle):
                collected.append(','.join(fields))
        except UnicodeDecodeError as decode_error:
            print(decode_error)
    return collected
| # Function to load data from JSON files | |
def load_data(filename):
    """
    Loads and returns the parsed content of a JSON file.

    Parameters:
    - filename: The path to the JSON file.

    Returns:
    - The deserialized JSON value, or an empty dict if reading the file
      raises a UnicodeDecodeError (the error is printed in that case).
    """
    with open(filename, 'r') as handle:
        try:
            parsed = json.load(handle)
        except UnicodeDecodeError as decode_error:
            print(decode_error)
            parsed = {}
    return parsed
def find_and_open_file(filename, start_directory):
    """
    Searches a directory tree for a file with the given name.

    The walk starts at start_directory and descends into all of its
    subfolders. The first match is announced with a printed message and its
    path is returned. Note that this function only locates the file; it does
    not open it.

    Parameters:
    - filename: The bare file name to look for.
    - start_directory: The directory at which the search starts.

    Returns:
    - The path of the first match, or None (after printing a message) when no
      file with that name exists below start_directory.
    """
    for current_dir, _subdirs, names in os.walk(start_directory):
        if filename not in names:
            continue
        match = os.path.join(current_dir, filename)
        print(f"File found: {match}")
        return match
    print(f"File {filename} not found.")
    return None
def open_file(filepath):
    """
    Opens the file with the default application, based on the operating system.

    On Windows the file is handed to os.startfile. On POSIX systems an
    external launcher is invoked: 'open' on macOS and 'xdg-open' everywhere
    else (Linux and other desktops), because the 'open' command only exists
    on macOS. Problems are reported with a printed message instead of an
    exception.

    Parameters:
    - filepath: The path of the file to open.

    Returns:
    - None
    """
    import sys  # local import: the module header does not import sys

    if not os.path.exists(filepath):
        print(f"File does not exist: {filepath}")
        return

    if os.name == 'nt':  # Windows
        os.startfile(filepath)
    elif os.name == 'posix':  # Linux, macOS, etc.
        launcher = 'open' if sys.platform == 'darwin' else 'xdg-open'
        try:
            subprocess.call((launcher, filepath))
        except FileNotFoundError:
            # The launcher binary is not installed (e.g. a headless server).
            print(f"Cannot open file, launcher '{launcher}' not found: {filepath}")
    else:
        print(f"Cannot open file on this operating system: {filepath}")
def list_folders_files_recursive(path, depth=0):
    """
    Prints the folders and files below a directory, descending into subfolders.

    Each folder is printed as soon as it is encountered and is immediately
    followed by its own contents; the files of a directory are printed after
    all of its entries have been visited. Every line is prefixed with one
    space per level of depth.

    Parameters:
    - path: The directory path to list contents from.
    - depth: The current depth of recursion (controls the printed indentation).

    Returns:
    - None
    """
    # Anything that is not a directory is reported and skipped.
    if not os.path.isdir(path):
        print(f"The provided path '{path}' is not a valid directory.")
        return

    prefix = ' ' * depth
    plain_files = []

    for name in os.listdir(path):
        child = os.path.join(path, name)
        if os.path.isdir(child):
            print(f"{prefix}Folder: {name}")
            # Show the subfolder's contents right below its own line.
            list_folders_files_recursive(child, depth + 1)
        elif os.path.isfile(child):
            # Files are deferred so they appear after the last entry.
            plain_files.append(name)

    for name in plain_files:
        print(f"{prefix}File: {name}")
def list_folders_files(path):
    """
    Collects the names of the folders and files directly inside a directory.

    Parameters:
    - path: The directory path to list contents from.

    Returns:
    - A tuple of two lists: (folders, files). Both lists are empty, and a
      message is printed, when path is not a directory. Entries that are
      neither a directory nor a regular file are left out.
    """
    folders, files = [], []

    if not os.path.isdir(path):
        print(f"The provided path '{path}' is not a valid directory.")
        return folders, files

    for name in os.listdir(path):
        candidate = os.path.join(path, name)
        # Pick the list the entry belongs to; skip everything else.
        if os.path.isdir(candidate):
            bucket = folders
        elif os.path.isfile(candidate):
            bucket = files
        else:
            continue
        bucket.append(name)

    return folders, files
if __name__ == "__main__":
    # Running this module as a script only prints this short description.
    print("here are all functions that read files")