import ocr import os import threading import concurrent.futures from multiprocessing import Pool # Import for multiprocessing import re from docx import Document # Assuming DOCX support is desired from pdfminer.high_level import extract_text # Import for PDF text extraction import time import pdftoimage import docxtoimage # Additional libraries for new file types #import openpyxl # For basic XLSX handling (consider pandas for structured data) #from pptx import Presentation # For PPTX presentations (install with: pip install python-pptx) try: from docx import Document except ImportError: print("To enable DOCX support, install python-docx: pip install python-docx") class DecodingError(Exception): pass def compile_keywords(categories_keywords_dict): """Pre-compiles keyword lists for faster matching""" compiled_keywords = {category: [re.compile(keyword, re.IGNORECASE) for keyword in keywords] for category, keywords in categories_keywords_dict.items()} return compiled_keywords def categorize_text_chunk(text_chunk, compiled_keywords): """Categorizes a chunk of text using compiled keywords""" for category, keyword_list in compiled_keywords.items(): if all(keyword.search(text_chunk) for keyword in keyword_list): return category return 'Uncategorized' def use_ocr(folder_path): #pag tinawag toh extract niya lahat ng text sa buffer folder all_extracted_text = "" for filename in os.listdir(folder_path): if filename.endswith(".jpg") or filename.endswith(".png"): image_path = os.path.join(folder_path, filename) extracted_text = ocr.extract_text_from_image(image_path) all_extracted_text += "\n".join(extracted_text) + "\n\n" # Add double newlines for separation return all_extracted_text def convert_pages(folder_path, output_format ,max_pages): for root, directories, files in os.walk(folder_path): for filename in files: # Get the file extension (including the dot) extension = os.path.splitext(filename)[1].lower() if extension=='.pdf': pdftoimage.convert_pdfs(folder_path, output_format,max_pages) if extension=='.docx': docxtoimage.process(folder_path,max_pages) def categorize_file(file_path, compiled_keywords): try: if file_path.endswith('.pdf'): text = extract_text(file_path) # Use pdfminer to extract text (CPU-bound) return file_path, categorize_text_chunk(text, compiled_keywords) elif file_path.endswith('.docx') and Document: # ... (code for DOCX files - potentially I/O bound) try: doc = Document(file_path) text = '\n'.join(paragraph.text for paragraph in doc.paragraphs) # Combine all paragraphs return file_path, categorize_text_chunk(text, compiled_keywords) except Exception as e: print(f"Error processing DOCX '{file_path}': {e}") return file_path, 'Uncategorized (Error)' elif file_path.endswith('.txt'): with open(file_path, 'r') as f: text = f.read() return file_path, categorize_text_chunk(text, compiled_keywords) else: print(f"Unsupported file type: {file_path}") return None, 'Unsupported File Type' except Exception as e: print(f"Error processing '{file_path}': {e}") return file_path, 'Uncategorized (Error)' def threaded_worker(file_paths_categories, output_dir): for file_path, category in file_paths_categories: if category is not None: # Skip unsupported files category_dir = os.path.join(output_dir, category) os.makedirs(category_dir, exist_ok=True) os.rename(file_path, os.path.join(category_dir, os.path.basename(file_path))) def multi_process_categorizer(input_dir, output_dir, categories_keywords_dict, num_processes): files = [os.path.join(input_dir, f) for f in os.listdir(input_dir)] # Use multiprocessing pool for CPU-bound text processing with Pool(processes=num_processes) as pool: results = pool.starmap(categorize_file, [(file_path, categories_keywords_dict) for file_path in files]) # Use concurrent.futures for potentially I/O-bound tasks like moving files with concurrent.futures.ThreadPoolExecutor() as executor: executor.submit(threaded_worker, results, output_dir) def chunks(lst, chunk_size): """Yield successive n-sized chunks from lst.""" for i in range(0, len(lst), chunk_size): yield lst[i:i + chunk_size]