Spaces:
Sleeping
Sleeping
| # # ai_doc_query_agent/app/ingestion.py | |
| # from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| # from langchain.document_loaders import UnstructuredFileLoader | |
| # def process_document(file_path): | |
| # loader = UnstructuredFileLoader(file_path) | |
| # docs = loader.load() | |
| # splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100) | |
| # chunks = splitter.split_documents(docs) | |
| # return chunks | |
| # #test | |
| """ | |
| ingest.py — Multi-modal document ingestion and chunking for AnyRAG | |
| Supports: Text, PDF, Images, Audio, CSV, JSON | |
| """ | |
| import os | |
| from pathlib import Path | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.document_loaders import UnstructuredPDFLoader | |
| from langchain_community.document_loaders import (JSONLoader, | |
| UnstructuredImageLoader, | |
| CSVLoader, | |
| UnstructuredFileLoader) | |
| import pytesseract | |
| from PIL import Image | |
| import whisper | |
| from langchain.schema import Document | |
| from dotenv import load_dotenv | |
| # Load environment variables from .env file | |
| load_dotenv() | |
| # client = OpenAI() | |
| # ------------------------------- | |
| # UTILS: Determine file type | |
| # ------------------------------- | |
def get_file_type(file_path: str) -> str:
    """Classify a file into a modality based on its extension.

    Args:
        file_path: Path (or bare filename) whose suffix is inspected.

    Returns:
        One of "text", "pdf", "image", "audio", "csv", "json",
        or "unknown" for any unrecognized extension.
    """
    # Single lookup table instead of an if/elif ladder; matching is
    # case-insensitive because the suffix is lower-cased first.
    extension_map = {
        ".txt": "text",
        ".md": "text",
        ".docx": "text",
        ".pdf": "pdf",
        ".jpg": "image",
        ".jpeg": "image",
        ".png": "image",
        ".mp3": "audio",
        ".wav": "audio",
        ".m4a": "audio",
        ".csv": "csv",
        ".json": "json",
    }
    suffix = Path(file_path).suffix.lower()
    return extension_map.get(suffix, "unknown")
| # ------------------------------- | |
| # LOADERS for different modalities | |
| # ------------------------------- | |
def load_text(file_path):
    """Load a plain-text-like file (.txt/.md/.docx) into Documents."""
    return UnstructuredFileLoader(file_path).load()
def load_pdf(file_path):
    """Load a PDF file into Documents via the Unstructured PDF loader."""
    return UnstructuredPDFLoader(file_path).load()
def load_image(file_path):
    """Extract text from an image using OCR (pytesseract).

    Args:
        file_path: Path to a .jpg/.jpeg/.png image.

    Returns:
        A single-element list with one Document holding the OCR text;
        metadata records the source path and the "image" modality.
    """
    # Use a context manager so the image file handle is closed even if
    # OCR raises — the original left the handle open (resource leak).
    with Image.open(file_path) as image:
        text = pytesseract.image_to_string(image)
    return [Document(page_content=text, metadata={"source": file_path, "modality": "image"})]
def load_audio(file_path):
    """Transcribe an audio file to text using OpenAI Whisper.

    The Whisper "base" model is loaded lazily on the first call and cached
    on the function object, so repeated calls do not reload the model
    weights each time (the original reloaded the model on every call).

    Args:
        file_path: Path to an audio file (.mp3/.wav/.m4a).

    Returns:
        A single-element list with one Document holding the transcript;
        metadata records the source path and the "audio" modality.
    """
    model = getattr(load_audio, "_model", None)
    if model is None:
        # Loading the model is expensive (reads/downloads weights); do it once.
        model = whisper.load_model("base")
        load_audio._model = model
    result = model.transcribe(file_path)
    text = result["text"]
    return [Document(page_content=text, metadata={"source": file_path, "modality": "audio"})]
def load_csv(file_path):
    """Load a CSV file into Documents (one Document per row via CSVLoader)."""
    return CSVLoader(file_path).load()
def load_json(file_path):
    """Load a JSON file into Documents.

    JSONLoader needs a jq-style schema selecting which part of the JSON to
    extract; "." selects the whole document. Without it the original call
    failed at runtime. text_content=False lets non-string values be
    serialized instead of raising.
    NOTE(review): JSONLoader depends on the `jq` package at runtime —
    confirm it is installed alongside langchain_community.
    """
    loader = JSONLoader(file_path, jq_schema=".", text_content=False)
    return loader.load()
| # ------------------------------- | |
| # CHUNKING PIPELINE | |
| # ------------------------------- | |
def chunk_documents(docs, chunk_size=500, chunk_overlap=100):
    """Split Documents into overlapping chunks for embedding/retrieval.

    Args:
        docs: Iterable of Documents to split.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between adjacent chunks.

    Returns:
        List of chunked Documents.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.split_documents(docs)
    return chunks
| # ------------------------------- | |
| # MAIN PROCESSOR | |
| # ------------------------------- | |
def process_document(file_path: str):
    """Detect a file's modality, load it, and split it into chunks.

    Args:
        file_path: Path to the input file.

    Returns:
        List of chunked Documents produced by chunk_documents().

    Raises:
        ValueError: If the file extension maps to an unsupported type.
    """
    file_type = get_file_type(file_path)
    print(f"π Detected file type: {file_type}")
    # Dispatch table keeps the modality -> loader mapping in one place.
    # The original also printed `(voice.page_content for voice in docs)`
    # for audio, which printed a generator object rather than the
    # transcript — that broken debug print is removed.
    loaders = {
        "text": load_text,
        "pdf": load_pdf,
        "image": load_image,
        "audio": load_audio,
        "csv": load_csv,
        "json": load_json,
    }
    loader = loaders.get(file_type)
    if loader is None:
        raise ValueError(f"Unsupported file type: {file_type}")
    docs = loader(file_path)
    chunks = chunk_documents(docs)
    print(f"β Processed {len(chunks)} chunks from {file_type} file.")
    return chunks