# AnyRAG-WebSearch / src/ingestion.py
# Author: Rashid Ali — initial commit (aaa9e08)
# # ai_doc_query_agent/app/ingestion.py
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.document_loaders import UnstructuredFileLoader
# def process_document(file_path):
# loader = UnstructuredFileLoader(file_path)
# docs = loader.load()
# splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
# chunks = splitter.split_documents(docs)
# return chunks
# #test
"""
ingest.py β€” Multi-modal document ingestion and chunking for AnyRAG
Supports: Text, PDF, Images, Audio, CSV, JSON
"""
import os
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import (JSONLoader,
UnstructuredImageLoader,
CSVLoader,
UnstructuredFileLoader)
import pytesseract
from PIL import Image
import whisper
from langchain.schema import Document
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
# client = OpenAI()
# -------------------------------
# UTILS: Determine file type
# -------------------------------
def get_file_type(file_path: str) -> str:
    """Classify a file into a modality bucket from its extension.

    Returns one of "text", "pdf", "image", "audio", "csv", "json",
    or "unknown" when the extension is not recognized.
    """
    type_by_extension = {
        ".txt": "text", ".md": "text", ".docx": "text",
        ".pdf": "pdf",
        ".jpg": "image", ".jpeg": "image", ".png": "image",
        ".mp3": "audio", ".wav": "audio", ".m4a": "audio",
        ".csv": "csv",
        ".json": "json",
    }
    # suffix.lower() makes the lookup case-insensitive (e.g. ".PDF").
    return type_by_extension.get(Path(file_path).suffix.lower(), "unknown")
# -------------------------------
# LOADERS for different modalities
# -------------------------------
def load_text(file_path):
    """Load a text-like file (.txt/.md/.docx) via Unstructured."""
    return UnstructuredFileLoader(file_path).load()
def load_pdf(file_path):
    """Load a PDF via Unstructured's PDF loader."""
    return UnstructuredPDFLoader(file_path).load()
def load_image(file_path):
    """OCR an image file and wrap the extracted text in one Document."""
    ocr_text = pytesseract.image_to_string(Image.open(file_path))
    metadata = {"source": file_path, "modality": "image"}
    return [Document(page_content=ocr_text, metadata=metadata)]
def load_audio(file_path):
    """Transcribe an audio file with OpenAI Whisper.

    Returns a single-element list containing one Document whose
    page_content is the full transcript.
    """
    # Loading the Whisper model is expensive; cache it on the function
    # so repeated calls (e.g. batch ingestion) reuse the same instance
    # instead of re-reading the weights from disk every time.
    if not hasattr(load_audio, "_model"):
        load_audio._model = whisper.load_model("base")
    result = load_audio._model.transcribe(file_path)
    text = result["text"]
    return [Document(page_content=text, metadata={"source": file_path, "modality": "audio"})]
def load_csv(file_path):
    """Load a CSV file via LangChain's CSVLoader."""
    return CSVLoader(file_path).load()
def load_json(file_path):
    """Load a JSON file via LangChain's JSONLoader.

    NOTE(review): some langchain_community versions require a ``jq_schema``
    argument to JSONLoader — confirm this call succeeds with the pinned
    version.
    """
    return JSONLoader(file_path).load()
# -------------------------------
# CHUNKING PIPELINE
# -------------------------------
def chunk_documents(docs, chunk_size=500, chunk_overlap=100):
    """Split loaded documents into overlapping character chunks."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_documents(docs)
    return chunks
# -------------------------------
# MAIN PROCESSOR
# -------------------------------
def process_document(file_path: str):
    """Detect a file's modality, load it, and split it into chunks.

    Args:
        file_path: Path to the file to ingest.

    Returns:
        The list of chunked documents produced by ``chunk_documents``.

    Raises:
        ValueError: If the file's extension maps to no supported modality.
    """
    # Dispatch table replaces the original if/elif ladder; keys match
    # the buckets returned by get_file_type().
    loaders = {
        "text": load_text,
        "pdf": load_pdf,
        "image": load_image,
        "audio": load_audio,
        "csv": load_csv,
        "json": load_json,
    }
    file_type = get_file_type(file_path)
    print(f"🔍 Detected file type: {file_type}")
    loader = loaders.get(file_type)
    if loader is None:
        raise ValueError(f"Unsupported file type: {file_type}")
    docs = loader(file_path)
    if file_type == "audio":
        # BUG FIX: the original did print(voice.page_content for voice in docs),
        # which prints the generator object itself — print each transcript.
        for voice in docs:
            print(voice.page_content)
    chunks = chunk_documents(docs)
    print(f"✅ Processed {len(chunks)} chunks from {file_type} file.")
    return chunks