Dheeraj-13's picture
Initial deployment of RAG Assistant
26fe9a7
import os
import glob
import json
import argparse
from pathlib import Path
from typing import List, Dict
from pypdf import PdfReader
from bs4 import BeautifulSoup
from .chunk import create_chunks
def load_pdf(path: str) -> str:
reader = PdfReader(path)
text = ""
for page in reader.pages:
text += page.extract_text() + "\n"
return text
def load_html(path: str) -> str:
with open(path, 'r', encoding='utf-8') as f:
soup = BeautifulSoup(f, 'html.parser')
# Remove script and style elements
for script in soup(["script", "style"]):
script.decompose()
text = soup.get_text()
# break into lines and remove leading and trailing space on each
lines = (line.strip() for line in text.splitlines())
# break multi-headlines into a line each
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
# drop blank lines
text = '\n'.join(chunk for chunk in chunks if chunk)
return text
def clean_text(text: str) -> str:
# Basic cleaning
return text.replace('\x00', '')
def process_file(filepath: str) -> Dict:
filename = os.path.basename(filepath)
ext = os.path.splitext(filename)[1].lower()
if ext == '.pdf':
raw_text = load_pdf(filepath)
elif ext in ['.html', '.htm']:
raw_text = load_html(filepath)
elif ext in ['.txt', '.md']:
with open(filepath, 'r', encoding='utf-8') as f:
raw_text = f.read()
else:
print(f"Skipping unsupported file: {filename}")
return None
cleaned_text = clean_text(raw_text)
doc_id = filename.replace(' ', '_')
metadata = {
"source": filename,
"doc_id": doc_id,
"created_at": str(os.path.getctime(filepath))
}
chunks = create_chunks(cleaned_text, metadata)
return {
"metadata": metadata,
"chunks": [vars(c) for c in chunks] # Serialize Chunk objects
}
def ingest(input_dir: str, output_dir: str):
os.makedirs(output_dir, exist_ok=True)
files = []
for ext in ['*.pdf', '*.html', '*.txt', '*.md']:
files.extend(glob.glob(os.path.join(input_dir, ext))) # Simple glob, non-recursive
processed_count = 0
all_chunks = []
manifest = []
for f in files:
print(f"Processing {f}...")
try:
result = process_file(f)
if result:
# Save individual doc chunks? Or one big file?
# User req: "Saved processed artifacts to /data/processed/ with a manifest.json"
# "cleaned markdown per document"
# We'll save the full text as markdown and the chunks structure
out_name = result['metadata']['doc_id'] + ".json"
out_path = os.path.join(output_dir, out_name)
with open(out_path, 'w') as f_out:
json.dump(result, f_out, indent=2)
manifest.append({
"doc_id": result['metadata']['doc_id'],
"path": out_path,
"chunk_count": len(result['chunks'])
})
processed_count += 1
except Exception as e:
print(f"Error processing {f}: {e}")
# Save manifest
with open(os.path.join(output_dir, "manifest.json"), 'w') as f:
json.dump(manifest, f, indent=2)
print(f"Ingestion complete. Processed {processed_count} files.")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--input", required=True, help="Input directory")
parser.add_argument("--out", required=True, help="Output directory")
args = parser.parse_args()
ingest(args.input, args.out)