import argparse
import glob
import json
import os
from typing import Dict, Optional

from bs4 import BeautifulSoup
from pypdf import PdfReader

from .chunk import create_chunks


def load_pdf(path: str) -> str:
    """Extract plain text from a PDF, one page at a time."""
    reader = PdfReader(path)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for image-only pages
        text += (page.extract_text() or "") + "\n"
    return text


def load_html(path: str) -> str:
    """Extract readable text from an HTML file, dropping scripts and styles."""
    with open(path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    # Remove script and style elements
    for script in soup(["script", "style"]):
        script.decompose()

    text = soup.get_text()
    # Break into lines and strip leading/trailing whitespace on each
    lines = (line.strip() for line in text.splitlines())
    # Break multi-headlines (separated by runs of spaces) into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # Drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return text


def clean_text(text: str) -> str:
    # Basic cleaning: strip NUL bytes that break downstream serialization
    return text.replace('\x00', '')


def process_file(filepath: str) -> Optional[Dict]:
    """Load, clean, and chunk a single file; returns None if unsupported."""
    filename = os.path.basename(filepath)
    ext = os.path.splitext(filename)[1].lower()

    if ext == '.pdf':
        raw_text = load_pdf(filepath)
    elif ext in ('.html', '.htm'):
        raw_text = load_html(filepath)
    elif ext in ('.txt', '.md'):
        with open(filepath, 'r', encoding='utf-8') as f:
            raw_text = f.read()
    else:
        print(f"Skipping unsupported file: {filename}")
        return None

    cleaned_text = clean_text(raw_text)
    doc_id = filename.replace(' ', '_')
    metadata = {
        "source": filename,
        "doc_id": doc_id,
        # ctime kept as a raw epoch timestamp string
        "created_at": str(os.path.getctime(filepath)),
    }
    chunks = create_chunks(cleaned_text, metadata)
    return {
        "metadata": metadata,
        "chunks": [vars(c) for c in chunks],  # Serialize Chunk objects to plain dicts
    }


def ingest(input_dir: str, output_dir: str):
    os.makedirs(output_dir, exist_ok=True)

    # Simple glob, non-recursive
    files = []
    for pattern in ('*.pdf', '*.html', '*.htm', '*.txt', '*.md'):
        files.extend(glob.glob(os.path.join(input_dir, pattern)))

    processed_count = 0
    manifest = []

    for filepath in files:
        print(f"Processing {filepath}...")
        try:
            result = process_file(filepath)
            if result:
                # One JSON artifact per document (metadata + chunks), indexed by
                # manifest.json in the output directory.
                # TODO: the requirement also asks for cleaned markdown per
                # document; only the chunk JSON is written for now.
                out_name = result['metadata']['doc_id'] + ".json"
                out_path = os.path.join(output_dir, out_name)
                with open(out_path, 'w') as f_out:
                    json.dump(result, f_out, indent=2)

                manifest.append({
                    "doc_id": result['metadata']['doc_id'],
                    "path": out_path,
                    "chunk_count": len(result['chunks']),
                })
                processed_count += 1
        except Exception as e:
            print(f"Error processing {filepath}: {e}")

    # Save manifest
    with open(os.path.join(output_dir, "manifest.json"), 'w') as f:
        json.dump(manifest, f, indent=2)

    print(f"Ingestion complete. Processed {processed_count} files.")


if __name__ == "__main__":
    # Note: because of the relative import above, run this as a module from the
    # package root (e.g. `python -m <package>.ingest`), not as a bare script.
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True, help="Input directory")
    parser.add_argument("--out", required=True, help="Output directory")
    args = parser.parse_args()
    ingest(args.input, args.out)
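
# ---------------------------------------------------------------------------
# For reference: `chunk.py` is not shown in this snippet. The code above only
# requires that `create_chunks(text, metadata)` return objects whose attributes
# serialize cleanly via `vars(c)`. A minimal, hypothetical sketch compatible
# with that contract is shown (commented out) below; the names, default size,
# and overlap are assumptions, not the real module:
#
# from dataclasses import dataclass
# from typing import Dict, List
#
# @dataclass
# class Chunk:
#     text: str
#     metadata: Dict
#     index: int
#
# def create_chunks(text: str, metadata: Dict,
#                   size: int = 1000, overlap: int = 200) -> List[Chunk]:
#     # Fixed-size character windows with overlap between consecutive chunks
#     chunks, start, i = [], 0, 0
#     step = max(1, size - overlap)
#     while start < len(text):
#         chunks.append(Chunk(text[start:start + size], metadata, i))
#         start += step
#         i += 1
#     return chunks
# ---------------------------------------------------------------------------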