# AnyRAG-WebSearch / src/ingestion.py
# Author: Rashid Ali — initial commit (aaa9e08)
# # ai_doc_query_agent/app/ingestion.py
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.document_loaders import UnstructuredFileLoader
# def process_document(file_path):
# loader = UnstructuredFileLoader(file_path)
# docs = loader.load()
# splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
# chunks = splitter.split_documents(docs)
# return chunks
# #test
"""
ingest.py β€” Multi-modal document ingestion and chunking for AnyRAG
Supports: Text, PDF, Images, Audio, CSV, JSON
"""
import os
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import (JSONLoader,
UnstructuredImageLoader,
CSVLoader,
UnstructuredFileLoader)
import pytesseract
from PIL import Image
import whisper
from langchain.schema import Document
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
# client = OpenAI()
# -------------------------------
# UTILS: Determine file type
# -------------------------------
def get_file_type(file_path: str) -> str:
    """Classify a file into a modality bucket from its extension.

    Returns one of "text", "pdf", "image", "audio", "csv", "json",
    or "unknown" when the extension is not recognized.
    """
    type_by_extension = {
        ".txt": "text", ".md": "text", ".docx": "text",
        ".pdf": "pdf",
        ".jpg": "image", ".jpeg": "image", ".png": "image",
        ".mp3": "audio", ".wav": "audio", ".m4a": "audio",
        ".csv": "csv",
        ".json": "json",
    }
    # suffix.lower() makes the lookup case-insensitive (e.g. ".PDF").
    return type_by_extension.get(Path(file_path).suffix.lower(), "unknown")
# -------------------------------
# LOADERS for different modalities
# -------------------------------
def load_text(file_path):
    """Load a text-like file (.txt/.md/.docx) via Unstructured."""
    return UnstructuredFileLoader(file_path).load()
def load_pdf(file_path):
    """Load a PDF via Unstructured's PDF loader."""
    return UnstructuredPDFLoader(file_path).load()
def load_image(file_path):
    """OCR an image file and wrap the extracted text in one Document."""
    ocr_text = pytesseract.image_to_string(Image.open(file_path))
    metadata = {"source": file_path, "modality": "image"}
    return [Document(page_content=ocr_text, metadata=metadata)]
def load_audio(file_path):
    """Transcribe an audio file with OpenAI Whisper.

    Returns a single-element list containing one Document whose
    page_content is the full transcript.
    """
    # Loading the Whisper model is expensive; cache it on the function
    # so repeated calls (e.g. batch ingestion) reuse the same instance
    # instead of re-reading the weights from disk every time.
    if not hasattr(load_audio, "_model"):
        load_audio._model = whisper.load_model("base")
    result = load_audio._model.transcribe(file_path)
    text = result["text"]
    return [Document(page_content=text, metadata={"source": file_path, "modality": "audio"})]
def load_csv(file_path):
    """Load a CSV file via LangChain's CSVLoader."""
    return CSVLoader(file_path).load()
def load_json(file_path):
    """Load a JSON file via LangChain's JSONLoader.

    NOTE(review): some langchain_community versions require a ``jq_schema``
    argument to JSONLoader — confirm this call succeeds with the pinned
    version.
    """
    return JSONLoader(file_path).load()
# -------------------------------
# CHUNKING PIPELINE
# -------------------------------
def chunk_documents(docs, chunk_size=500, chunk_overlap=100):
    """Split loaded documents into overlapping character chunks."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_documents(docs)
    return chunks
# -------------------------------
# MAIN PROCESSOR
# -------------------------------
def process_document(file_path: str):
    """Detect a file's modality, load it, and split it into chunks.

    Args:
        file_path: Path to the file to ingest.

    Returns:
        The list of chunked documents produced by ``chunk_documents``.

    Raises:
        ValueError: If the file's extension maps to no supported modality.
    """
    # Dispatch table replaces the original if/elif ladder; keys match
    # the buckets returned by get_file_type().
    loaders = {
        "text": load_text,
        "pdf": load_pdf,
        "image": load_image,
        "audio": load_audio,
        "csv": load_csv,
        "json": load_json,
    }
    file_type = get_file_type(file_path)
    print(f"🔍 Detected file type: {file_type}")
    loader = loaders.get(file_type)
    if loader is None:
        raise ValueError(f"Unsupported file type: {file_type}")
    docs = loader(file_path)
    if file_type == "audio":
        # BUG FIX: the original did print(voice.page_content for voice in docs),
        # which prints the generator object itself — print each transcript.
        for voice in docs:
            print(voice.page_content)
    chunks = chunk_documents(docs)
    print(f"✅ Processed {len(chunks)} chunks from {file_type} file.")
    return chunks