Spaces:

pvanand
/

documind-api

Sleeping

App Files Files Community

pvanand commited on Dec 11, 2024

Commit

5d42805

verified ·

1 Parent(s): d53c11a

Upload 7 files

Browse files

Files changed (7) hide show

Dockerfile +20 -0
docker-compose.yml +10 -0
document_rag_router.py +400 -0
main.py +293 -0
readme.md +91 -0
requirements.txt +12 -0
utils.py +253 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,20 @@

+FROM python:3.11-slim
+# Install Tk: utils.py imports the stdlib turtle module, which needs tkinter (presumably the reason for this layer)
+RUN apt-get update && apt-get install -y \
+    tk \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt \
+    && pip install torch --index-url https://download.pytorch.org/whl/cpu \
+    && pip install sentence-transformers
+COPY . .
+EXPOSE 80
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--log-level", "debug"]

docker-compose.yml ADDED Viewed

	@@ -0,0 +1,10 @@

+services:
+  rag-api:
+    build: .  # builds the image from the Dockerfile in this directory
+    container_name: rag-api
+    restart: unless-stopped
+    environment:
+      - OPENAI_API_KEY=${OPENAI_API_KEY}  # forwarded from the host environment
+    ports:
+      - "9004:80"  # host port 9004 -> container port 80 (the port uvicorn listens on)

document_rag_router.py ADDED Viewed

	@@ -0,0 +1,400 @@

+from fastapi import UploadFile, File, Form, HTTPException, APIRouter
+from typing import List, Optional, Dict, Tuple
+import lancedb
+from lancedb.pydantic import LanceModel, Vector
+from lancedb.embeddings import get_registry
+import pandas as pd
+from utils import process_pdf_to_chunks
+import hashlib
+import uuid
+import json
+from datetime import datetime
+from pydantic import BaseModel
+import logging
+# Create router: every endpoint declared in this module is served under the /rag prefix
+router = APIRouter(
+    prefix="/rag",
+    tags=["rag"]
+)
+# Initialize LanceDB (database directory /tmp/db) and the sentence-transformers embedding model, both shared module-wide
+db = lancedb.connect("/tmp/db")
+model = get_registry().get("sentence-transformers").create(
+    name="Snowflake/snowflake-arctic-embed-xs",
+    device="cpu"
+)
+def get_user_collection(user_id: str, collection_name: str) -> str:
+    """Generate user-specific collection name"""
+    return f"{user_id}_{collection_name}"
+class DocumentChunk(LanceModel):  # row schema of every per-user collection table
+    text: str = model.SourceField()  # chunk text; the field the embedding model reads
+    vector: Vector(model.ndims()) = model.VectorField()  # embedding column, sized from the model's dimensionality
+    document_id: str  # short content-hash id of the source file
+    chunk_index: int  # position of the chunk within its document
+    file_name: str  # original upload file name
+    file_type: str  # lower-cased file extension
+    created_date: str  # ISO-8601 timestamp string taken from the chunk metadata
+    collection_id: str  # full table name, i.e. "<user_id>_<collection name>"
+    user_id: str  # owner supplied with the upload request
+    metadata_json: str  # the complete chunk metadata dict serialized with json.dumps
+    char_start: int  # start offset of the chunk in the extracted document text
+    char_end: int  # end offset of the chunk in the extracted document text
+    page_numbers: List[int]  # pages the chunk spans ([1] for plain-text uploads)
+    images: List[str]  # base64-encoded images attached to the chunk (empty for plain text)
+class QueryInput(BaseModel):  # request body of the query endpoints
+    collection_id: str  # collection name WITHOUT the user prefix; the endpoint prepends "<user_id>_"
+    query: str  # free-text search query
+    top_k: Optional[int] = 3  # maximum number of results to return
+    user_id: str  # owner of the collection
+class SearchResult(BaseModel):  # one hit returned by query_collection
+    text: str  # chunk text
+    distance: float  # value of the search result's _distance field
+    metadata: Dict  # chunk metadata decoded from the stored metadata_json column
+class SearchResponse(BaseModel):  # response model of /rag/query_collection
+    results: List[SearchResult]  # hits in the order the search returned them
+async def process_file(file: UploadFile, collection_id: str, user_id: str) -> Tuple[List[dict], str]:
+    """Process single file and return chunks with metadata"""
+    content = await file.read()
+    file_type = file.filename.split('.')[-1].lower()
+    chunks = []
+    doc_id = ""
+    if file_type == 'pdf':
+        chunks, doc_id = process_pdf_to_chunks(
+            pdf_content=content,
+            file_name=file.filename
+        )
+    elif file_type == 'txt':
+        doc_id = hashlib.sha256(content).hexdigest()[:4]
+        text_content = content.decode('utf-8')
+        chunks = [{
+            "text": text_content,
+            "metadata": {
+                "created_date": datetime.now().isoformat(),
+                "file_name": file.filename,
+                "document_id": doc_id,
+                "user_id": user_id,
+                "location": {
+                    "chunk_index": 0,
+                    "char_start": 0,
+                    "char_end": len(text_content),
+                    "pages": [1],
+                    "total_chunks": 1
+                },
+                "images": []
+            }
+        }]
+    return chunks, doc_id
+@router.post("/upload_files")
+async def upload_files(
+    files: List[UploadFile] = File(...),
+    collection_name: Optional[str] = Form(None),
+    user_id: str = Form(...)
+):
+    try:
+        collection_id = get_user_collection(
+            user_id,
+            collection_name if collection_name else f"col_{uuid.uuid4().hex[:8]}"
+        )
+        all_chunks = []
+        doc_ids = {}
+        for file in files:
+            try:
+                chunks, doc_id = await process_file(file, collection_id, user_id)
+                for chunk in chunks:
+                    chunk_data = {
+                        "text": chunk["text"],
+                        "document_id": chunk["metadata"]["document_id"],
+                        "chunk_index": chunk["metadata"]["location"]["chunk_index"],
+                        "file_name": chunk["metadata"]["file_name"],
+                        "file_type": file.filename.split('.')[-1].lower(),
+                        "created_date": chunk["metadata"]["created_date"],
+                        "collection_id": collection_id,
+                        "user_id": user_id,
+                        "metadata_json": json.dumps(chunk["metadata"]),
+                        "char_start": chunk["metadata"]["location"]["char_start"],
+                        "char_end": chunk["metadata"]["location"]["char_end"],
+                        "page_numbers": chunk["metadata"]["location"]["pages"],
+                        "images": chunk["metadata"].get("images", [])
+                    }
+                    all_chunks.append(chunk_data)
+                doc_ids[doc_id] = file.filename
+            except Exception as e:
+                logging.error(f"Error processing file {file.filename}: {str(e)}")
+                raise HTTPException(
+                    status_code=400,
+                    detail=f"Error processing file {file.filename}: {str(e)}"
+                )
+        try:
+            table = db.open_table(collection_id)
+        except Exception as e:
+            logging.error(f"Error opening table: {str(e)}")
+            try:
+                table = db.create_table(
+                    collection_id,
+                    schema=DocumentChunk,
+                    mode="create"
+                )
+                # Create FTS index on the text column for hybrid search support
+                # table.create_fts_index(
+                #     field_names="text",
+                #     replace=True,
+                #     tokenizer_name="en_stem",  # Use English stemming
+                #     lower_case=True,  # Convert text to lowercase
+                #     remove_stop_words=True,  # Remove common words like "the", "is", "at"
+                #     writer_heap_size=1024 * 1024 * 1024  # 1GB heap size
+                # )
+            except Exception as e:
+                logging.error(f"Error creating table: {str(e)}")
+                raise HTTPException(
+                    status_code=500,
+                    detail=f"Error creating database table: {str(e)}"
+                )
+        try:
+            df = pd.DataFrame(all_chunks)
+            table.add(data=df)
+        except Exception as e:
+            logging.error(f"Error adding data to table: {str(e)}")
+            raise HTTPException(
+                status_code=500,
+                detail=f"Error adding data to database: {str(e)}"
+            )
+        return {
+            "message": f"Successfully processed {len(files)} files",
+            "collection_id": collection_id,
+            "total_chunks": len(all_chunks),
+            "user_id": user_id,
+            "document_ids": doc_ids
+        }
+    except HTTPException:
+        raise
+    except Exception as e:
+        logging.error(f"Unexpected error during file upload: {str(e)}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"Unexpected error: {str(e)}"
+        )
+@router.get("/get_document/{collection_id}/{document_id}")
+async def get_document(
+    collection_id: str,
+    document_id: str,
+    user_id: str
+):
+    try:
+        table = db.open_table(f"{user_id}_{collection_id}")
+    except Exception as e:
+        logging.error(f"Error opening table: {str(e)}")
+        raise HTTPException(
+            status_code=404,
+            detail=f"Collection not found: {str(e)}"
+        )
+    try:
+        chunks = table.to_pandas()
+        doc_chunks = chunks[
+            (chunks['document_id'] == document_id) &
+            (chunks['user_id'] == user_id)
+        ].sort_values('chunk_index')
+        if len(doc_chunks) == 0:
+            raise HTTPException(
+                status_code=404,
+                detail=f"Document {document_id} not found in collection {collection_id}"
+            )
+        return {
+            "document_id": document_id,
+            "file_name": doc_chunks.iloc[0]['file_name'],
+            "chunks": [
+                {
+                    "text": row['text'],
+                    "metadata": json.loads(row['metadata_json'])
+                }
+                for _, row in doc_chunks.iterrows()
+            ]
+        }
+    except HTTPException:
+        raise
+    except Exception as e:
+        logging.error(f"Error retrieving document: {str(e)}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"Error retrieving document: {str(e)}"
+        )
+@router.post("/query_collection", response_model=SearchResponse)
+async def query_collection(input_data: QueryInput):
+    try:
+        collection_id = get_user_collection(input_data.user_id, input_data.collection_id)
+        try:
+            table = db.open_table(collection_id)
+        except Exception as e:
+            logging.error(f"Error opening table: {str(e)}")
+            raise HTTPException(
+                status_code=404,
+                detail=f"Collection not found: {str(e)}"
+            )
+        try:
+            results = (
+                table.search(input_data.query)
+                .where(f"user_id = '{input_data.user_id}'")
+                .limit(input_data.top_k)
+                .to_list()
+            )
+        except Exception as e:
+            logging.error(f"Error searching collection: {str(e)}")
+            raise HTTPException(
+                status_code=500,
+                detail=f"Error searching collection: {str(e)}"
+            )
+        return SearchResponse(results=[
+            SearchResult(
+                text=r['text'],
+                distance=float(r['_distance']),
+                metadata=json.loads(r['metadata_json'])
+            )
+            for r in results
+        ])
+    except HTTPException:
+        raise
+    except Exception as e:
+        logging.error(f"Unexpected error during query: {str(e)}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"Unexpected error: {str(e)}"
+        )
+@router.get("/list_collections")
+async def list_collections(user_id: str):
+    try:
+        all_collections = db.table_names()
+        user_collections = [
+            c for c in all_collections
+            if c.startswith(f"{user_id}_")
+        ]
+        # Get documents for each collection
+        collections_info = []
+        for collection_name in user_collections:
+            try:
+                table = db.open_table(collection_name)
+                df = table.to_pandas()
+                # Group by document_id to get unique documents
+                documents = df.groupby('document_id').agg({
+                    'file_name': 'first',
+                    'created_date': 'first'
+                }).reset_index()
+                collections_info.append({
+                    "collection_id": collection_name.replace(f"{user_id}_", ""),
+                    "documents": [
+                        {
+                            "document_id": row['document_id'],
+                            "file_name": row['file_name'],
+                            "created_date": row['created_date']
+                        }
+                        for _, row in documents.iterrows()
+                    ]
+                })
+            except Exception as e:
+                logging.error(f"Error processing collection {collection_name}: {str(e)}")
+                continue
+        return {"collections": collections_info}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@router.delete("/delete_collection/{collection_id}")
+async def delete_collection(collection_id: str, user_id: str):
+    try:
+        full_collection_id = f"{user_id}_{collection_id}"
+        # Check if collection exists
+        try:
+            table = db.open_table(full_collection_id)
+        except Exception as e:
+            logging.error(f"Collection not found: {str(e)}")
+            raise HTTPException(
+                status_code=404,
+                detail=f"Collection {collection_id} not found"
+            )
+        # Verify ownership
+        if not full_collection_id.startswith(f"{user_id}_"):
+            logging.error(f"Unauthorized deletion attempt for collection {collection_id} by user {user_id}")
+            raise HTTPException(
+                status_code=403,
+                detail="Not authorized to delete this collection"
+            )
+        try:
+            db.drop_table(full_collection_id)
+        except Exception as e:
+            logging.error(f"Error deleting collection {collection_id}: {str(e)}")
+            raise HTTPException(
+                status_code=500,
+                detail=f"Error deleting collection: {str(e)}"
+            )
+        return {
+            "message": f"Collection {collection_id} deleted successfully",
+            "collection_id": collection_id
+        }
+    except HTTPException:
+        raise
+    except Exception as e:
+        logging.error(f"Unexpected error deleting collection {collection_id}: {str(e)}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"Unexpected error: {str(e)}"
+        )
+@router.post("/query_collection_tool")
+async def query_collection_tool(input_data: QueryInput):
+    try:
+        response = await query_collection(input_data)
+        results = []
+        # Access response directly since it's a Pydantic model
+        for r in response.results:
+            result_dict = {
+                "text": r.text,
+                "distance": r.distance,
+                "metadata": {
+                    "document_id": r.metadata.get("document_id"),
+                    "chunk_index": r.metadata.get("location", {}).get("chunk_index")
+                }
+            }
+            results.append(result_dict)
+        return str(results)
+    except Exception as e:
+        logging.error(f"Unexpected error during query: {str(e)}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"Unexpected error: {str(e)}"
+        )

main.py ADDED Viewed

	@@ -0,0 +1,293 @@

+import uuid
+from fastapi import FastAPI
+from fastapi.responses import StreamingResponse
+from langchain_core.messages import (
+    BaseMessage,
+    HumanMessage,
+    trim_messages,
+)
+from langchain_core.tools import tool
+from langchain_openai import ChatOpenAI
+from langgraph.checkpoint.memory import MemorySaver
+from langgraph.prebuilt import create_react_agent
+from pydantic import BaseModel
+import json
+from typing import Optional, Annotated
+from langchain_core.runnables import RunnableConfig
+from langgraph.prebuilt import InjectedState
+from document_rag_router import router as document_rag_router
+from document_rag_router import QueryInput, query_collection, SearchResult
+from fastapi import HTTPException
+import requests
+from sse_starlette.sse import EventSourceResponse
+from fastapi.middleware.cors import CORSMiddleware
+import re
+app = FastAPI()
+app.include_router(document_rag_router)  # mounts the document endpoints (router prefix: /rag)
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # any origin may call the API
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+@tool
+def get_user_age(name: str) -> str:
+    """Use this tool to find the user's age."""
+    if "bob" in name.lower():
+        return "42 years old"
+    return "41 years old"
+@tool
+async def query_documents(
+    query: str,
+    config: RunnableConfig,
+    #state: Annotated[dict, InjectedState]
+) -> str:
+    """Use this tool to retrieve relevant data from the collection.
+    Args:
+        query: The search query to find relevant document passages
+    """
+    # Get collection_id and user_id from config
+    thread_config = config.get("configurable", {})
+    collection_id = thread_config.get("collection_id")
+    user_id = thread_config.get("user_id")
+    if not collection_id or not user_id:
+        return "Error: collection_id and user_id are required in the config"
+    try:
+        # Create query input
+        input_data = QueryInput(
+            collection_id=collection_id,
+            query=query,
+            user_id=user_id,
+            top_k=6
+        )
+        response = await query_collection(input_data)
+        results = []
+        # Access response directly since it's a Pydantic model
+        for r in response.results:
+            result_dict = {
+                "text": r.text,
+                "distance": r.distance,
+                "metadata": {
+                    "document_id": r.metadata.get("document_id"),
+                    "chunk_index": r.metadata.get("location", {}).get("chunk_index")
+                }
+            }
+            results.append(result_dict)
+        return str(results)
+    except Exception as e:
+        print(e)
+        return f"Error querying documents: {e} PAUSE AND ASK USER FOR HELP"
+async def query_documents_raw(
+    query: str,
+    config: RunnableConfig,
+    #state: Annotated[dict, InjectedState]
+) -> SearchResult:
+    """Use this tool to retrieve relevant data from the collection.
+    Args:
+        query: The search query to find relevant document passages
+    """
+    # Get collection_id and user_id from config
+    thread_config = config.get("configurable", {})
+    collection_id = thread_config.get("collection_id")
+    user_id = thread_config.get("user_id")
+    if not collection_id or not user_id:
+        return "Error: collection_id and user_id are required in the config"
+    try:
+        # Create query input
+        input_data = QueryInput(
+            collection_id=collection_id,
+            query=query,
+            user_id=user_id,
+            top_k=6
+        )
+        response = await query_collection(input_data)
+        return response.results
+    except Exception as e:
+        print(e)
+        return f"Error querying documents: {e} PAUSE AND ASK USER FOR HELP"
+memory = MemorySaver()  # checkpointer handed to the agent below
+model = ChatOpenAI(model="gpt-4o-mini", streaming=True)  # chat model driving the agent
+def state_modifier(state) -> list[BaseMessage]:  # passed to create_react_agent as its state_modifier
+    return trim_messages(
+        state["messages"],
+        token_counter=len,  # NOTE(review): len() of a message list counts messages, not tokens
+        max_tokens=16000,  # so with token_counter=len this caps the number of messages - confirm intent
+        strategy="last",
+        start_on="human",
+        include_system=True,
+        allow_partial=False,
+    )
+agent = create_react_agent(
+    model,
+    tools=[query_documents],  # the only tool exposed to the agent
+    checkpointer=memory,
+    state_modifier=state_modifier,
+)
+class ChatInput(BaseModel):  # request body of /chat and /chat2
+    message: str  # user message sent to the agent
+    thread_id: Optional[str] = None  # conversation id; the endpoints generate a UUID when omitted
+    collection_id: Optional[str] = None  # forwarded to query_documents through the run config
+    user_id: Optional[str] = None  # forwarded to query_documents through the run config
+@app.post("/chat")
+async def chat(input_data: ChatInput):
+    thread_id = input_data.thread_id or str(uuid.uuid4())
+    config = {
+        "configurable": {
+            "thread_id": thread_id,
+            "collection_id": input_data.collection_id,
+            "user_id": input_data.user_id
+        }
+    }
+    input_message = HumanMessage(content=input_data.message)
+    async def generate():
+        async for event in agent.astream_events(
+            {"messages": [input_message]},
+            config,
+            version="v2"
+        ):
+            kind = event["event"]
+            if kind == "on_chat_model_stream":
+                content = event["data"]["chunk"].content
+                if content:
+                    yield f"{json.dumps({'type': 'token', 'content': content})}"
+            elif kind == "on_tool_start":
+                tool_input = str(event['data'].get('input', ''))
+                yield f"{json.dumps({'type': 'tool_start', 'tool': event['name'], 'input': tool_input})}"
+            elif kind == "on_tool_end":
+                tool_output = str(event['data'].get('output', ''))
+                yield f"{json.dumps({'type': 'tool_end', 'tool': event['name'], 'output': tool_output})}"
+    return EventSourceResponse(
+        generate(),
+        media_type="text/event-stream"
+    )
+async def clean_tool_input(tool_input: str):
+    # Use regex to parse the first key and value
+    pattern = r"{\s*'([^']+)':\s*'([^']+)'"
+    match = re.search(pattern, tool_input)
+    if match:
+        key, value = match.groups()
+        return {key: value}
+    return [tool_input]
+async def clean_tool_response(tool_output: str):
+    """Clean and extract relevant information from tool response if it contains query_documents."""
+    if "query_documents" in tool_output:
+        try:
+            # First safely evaluate the string as a Python literal
+            import ast
+            print(tool_output)
+            # Extract the list string from the content
+            start = tool_output.find("[{")
+            end = tool_output.rfind("}]") + 2
+            if start >= 0 and end > 0:
+                list_str = tool_output[start:end]
+                # Convert string to Python object using ast.literal_eval
+                results = ast.literal_eval(list_str)
+                # Return only relevant fields
+                return [{"text": r["text"], "document_id": r["metadata"]["document_id"]}
+                       for r in results]
+        except SyntaxError as e:
+            print(f"Syntax error in parsing: {e}")
+            return f"Error parsing document results: {str(e)}"
+        except Exception as e:
+            print(f"General error: {e}")
+            return f"Error processing results: {str(e)}"
+    return tool_output
+@app.post("/chat2")
+async def chat2(input_data: ChatInput):
+    thread_id = input_data.thread_id or str(uuid.uuid4())
+    config = {
+        "configurable": {
+            "thread_id": thread_id,
+            "collection_id": input_data.collection_id,
+            "user_id": input_data.user_id
+        }
+    }
+    input_message = HumanMessage(content=input_data.message)
+    async def generate():
+        async for event in agent.astream_events(
+            {"messages": [input_message]},
+            config,
+            version="v2"
+        ):
+            kind = event["event"]
+            if kind == "on_chat_model_stream":
+                content = event["data"]["chunk"].content
+                if content:
+                    yield f"{json.dumps({'type': 'token', 'content': content})}"
+            elif kind == "on_tool_start":
+                tool_name = event['name']
+                tool_input = event['data'].get('input', '')
+                clean_input = await clean_tool_input(str(tool_input))
+                yield f"{json.dumps({'type': 'tool_start', 'tool': tool_name, 'inputs': clean_input})}"
+            elif kind == "on_tool_end":
+                if "query_documents" in event['name']:
+                    print(event)
+                    raw_output = await query_documents_raw(str(event['data'].get('input', '')), config)
+                    try:
+                        serializable_output = [
+                            {
+                                "text": result.text,
+                                "distance": result.distance,
+                                "metadata": result.metadata
+                            }
+                            for result in raw_output
+                        ]
+                        yield f"{json.dumps({'type': 'tool_end', 'tool': event['name'], 'output': json.dumps(serializable_output)})}"
+                    except Exception as e:
+                        print(e)
+                        yield f"{json.dumps({'type': 'tool_end', 'tool': event['name'], 'output': str(raw_output)})}"
+                else:
+                    tool_name = event['name']
+                    raw_output = str(event['data'].get('output', ''))
+                    clean_output = await clean_tool_response(raw_output)
+                    yield f"{json.dumps({'type': 'tool_end', 'tool': tool_name, 'output': clean_output})}"
+    return EventSourceResponse(
+        generate(),
+        media_type="text/event-stream"
+    )
+@app.get("/health")
+async def health_check():
+    return {"status": "healthy"}

readme.md ADDED Viewed

	@@ -0,0 +1,91 @@

+# Document RAG User API
+This is a FastAPI application for processing and managing document uploads, including PDF and text files. The application allows users to upload files, query collections, and manage their document data.
+## Features
+- Upload PDF and plain-text (TXT) files
+- Efficiently process and store document chunks with metadata
+- Perform queries on collections using user-defined input
+- Retrieve and list collections specific to each user
+- Remove collections as needed
+## Requirements
+- Python 3.9+ (the Docker image uses Python 3.11)
+- FastAPI
+- LanceDB
+- Pydantic
+- Pandas
+- Other dependencies as specified in `requirements.txt`
+## Installation
+1. Clone the repository:
+   ```bash
+   git clone <repository-url>
+   cd <repository-directory>
+   ```
+2. Install the required packages:
+   ```bash
+   pip install -r requirements.txt
+   ```
+3. Run the application:
+   ```bash
+   uvicorn main:app --reload
+   ```
+## API Endpoints
+### Upload Files
+- **POST** `/rag/upload_files`
+  - Upload multiple files.
+  - Parameters:
+    - `files`: List of files to upload.
+    - `collection_name`: Optional name for the collection.
+    - `user_id`: User identifier.
+### Get Document
+- **GET** `/rag/get_document/{collection_id}/{document_id}`
+  - Retrieve a specific document by its ID from a collection.
+  - Parameters:
+    - `collection_id`: ID of the collection.
+    - `document_id`: ID of the document.
+    - `user_id`: User identifier.
+### Query Collection
+- **POST** `/rag/query_collection`
+  - Query a collection based on user input.
+  - Request Body:
+    - `collection_id`: ID of the collection.
+    - `query`: Search query.
+    - `top_k`: Optional number of top results to return (default is 3).
+    - `user_id`: User identifier.
+### List Collections
+- **GET** `/rag/list_collections`
+  - List all collections for a specific user.
+  - Parameters:
+    - `user_id`: User identifier.
+### Delete Collection
+- **DELETE** `/rag/delete_collection/{collection_id}`
+  - Delete a specific collection.
+  - Parameters:
+    - `collection_id`: ID of the collection to delete.
+    - `user_id`: User identifier.
+## Contributing
+Contributions are welcome! Please open an issue or submit a pull request for any improvements or bug fixes.
+## License
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+fastapi[standard]
+langchain-core
+langchain-openai
+langgraph
+pydantic
+pandas
+lancedb
+pymupdf
+langchain-text-splitters
+sse-starlette
+typing-extensions
+tantivy

utils.py ADDED Viewed

	@@ -0,0 +1,253 @@

+"""
+Contains utility functions for the LLM and database modules, along with some other miscellaneous functions.
+"""
+from turtle import clear
+from pymupdf import pymupdf
+#from docx import Document
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+#import tiktoken
+import base64
+import hashlib
+from typing import List
+from openai import OpenAI
+#from dotenv import load_dotenv
+import os
+import hashlib
+from datetime import datetime
+from typing import List, Optional, Dict, Any, Tuple
+def generate_file_id(file_bytes: bytes) -> str:
+    """Generate a 4-character unique file ID for given file."""
+    hash_obj = hashlib.sha256()
+    hash_obj.update(file_bytes[:4096])  # Still hash the first 4096 bytes
+    # Take first 2 bytes (16 bits) and convert to base36 (alphanumeric)
+    file_id = hex(int.from_bytes(hash_obj.digest()[:2], 'big'))[2:].zfill(4)
+    return file_id
+def process_pdf_to_chunks(
+    pdf_content: bytes,
+    file_name: str,
+    chunk_size: int = 512,
+    chunk_overlap: int = 20
+) -> Tuple[List[Dict[str, Any]], str]:
+    """
+    Process PDF content into chunks with column layout detection and proper image handling
+    """
+    doc = pymupdf.open(stream=pdf_content, filetype="pdf")
+    document_text = ""
+    all_images = []
+    image_positions = []
+    char_to_page_map = []
+    layout_info = {}
+    doc_id = generate_file_id(pdf_content)
+    def detect_columns(blocks):
+        """Detect if page has multiple columns based on text block positions"""
+        if not blocks:
+            return 1
+        x_positions = [block[0] for block in blocks]
+        x_positions.sort()
+        if len(x_positions) > 1:
+            gaps = [x_positions[i+1] - x_positions[i] for i in range(len(x_positions)-1)]
+            significant_gaps = [gap for gap in gaps if gap > page.rect.width * 0.15]
+            return len(significant_gaps) + 1
+        return 1
+    def sort_blocks_by_position(blocks, num_columns):
+        """Sort blocks by column and vertical position"""
+        if num_columns == 1:
+            return sorted(blocks, key=lambda b: b[0][1])  # b[0] is the bbox tuple, b[0][1] is y coordinate
+        page_width = page.rect.width
+        column_width = page_width / num_columns
+        def get_column(block):
+            bbox = block[0]  # Get the bounding box tuple
+            x_coord = bbox[0]  # Get the x coordinate (first element)
+            return int(x_coord // column_width)
+        return sorted(blocks, key=lambda b: (get_column(b), b[0][1]))
+    # Process each page
+    for page_num, page in enumerate(doc, 1):
+        blocks = page.get_text_blocks()
+        images = page.get_images()
+        # Detect layout
+        num_columns = detect_columns(blocks)
+        layout_info[page_num] = {
+            "columns": num_columns,
+            "width": page.rect.width,
+            "height": page.rect.height
+        }
+        # Create elements list with both text and images
+        elements = [(block[:4], block[4], "text") for block in blocks]
+        # Add images to elements
+        for img in images:
+            try:
+                img_rects = page.get_image_rects(img[0])
+                if img_rects and len(img_rects) > 0:
+                    img_bbox = img_rects[0]
+                    if img_bbox:
+                        img_data = (img_bbox, img[0], "image")
+                        elements.append(img_data)
+            except Exception as e:
+                print(f"Error processing image: {e}")
+                continue
+        # Sort elements by position
+        sorted_elements = sort_blocks_by_position(elements, num_columns)
+        # Process elements in order
+        page_text = ""
+        for element in sorted_elements:
+            if element[2] == "text":
+                text_content = element[1]
+                page_text += text_content
+                char_to_page_map.extend([page_num] * len(text_content))
+            else:
+                xref = element[1]
+                base_image = doc.extract_image(xref)
+                image_bytes = base_image["image"]
+                # Convert image bytes to base64
+                image_base64 = base64.b64encode(image_bytes).decode('utf-8')
+                all_images.append(image_base64)  # Store base64 encoded image
+                image_marker = f"\n<img_{len(all_images)-1}>\n"
+                image_positions.append((len(all_images)-1, len(document_text) + len(page_text)))
+                page_text += image_marker
+                char_to_page_map.extend([page_num] * len(image_marker))
+        document_text += page_text
+    # Create chunks
+    splitter = RecursiveCharacterTextSplitter(
+        #separators=["\n\n", "\n", " ", ""],
+        #keep_separator=True
+    ).from_tiktoken_encoder(
+        encoding_name="cl100k_base",
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap
+    )
+    text_chunks = splitter.split_text(document_text)
+    # Process chunks with metadata
+    processed_chunks = []
+    for chunk_idx, chunk in enumerate(text_chunks):
+        chunk_start = document_text.find(chunk)
+        chunk_end = chunk_start + len(chunk)
+        # Get page range and layout info
+        chunk_pages = sorted(set(char_to_page_map[chunk_start:chunk_end]))
+        chunk_layouts = {page: layout_info[page] for page in chunk_pages}
+        # Get images for this chunk
+        chunk_images = []
+        for img_idx, img_pos in image_positions:
+            if chunk_start <= img_pos <= chunk_end:
+                chunk_images.append(all_images[img_idx])  # Already base64 encoded
+        # Clean the chunk text
+        #cleaned_chunk = clean_text_for_llm(chunk)
+        chunk_dict = {
+            "text": chunk,
+            "metadata": {
+                "created_date": datetime.now().isoformat(),
+                "file_name": file_name,
+                "images": chunk_images,
+                "document_id": doc_id,
+                "location": {
+                    "char_start": chunk_start,
+                    "char_end": chunk_end,
+                    "pages": chunk_pages,
+                    "chunk_index": chunk_idx,
+                    "total_chunks": len(text_chunks),
+                    "layout": chunk_layouts
+                }
+            }
+        }
+        processed_chunks.append(chunk_dict)
+    return processed_chunks, doc_id
+# import re
+# import unicodedata
+# from typing import Optional
+# # Compile regex patterns once
+# HTML_TAG_PATTERN = re.compile(r'<[^>]+>')
+# MULTIPLE_NEWLINES = re.compile(r'\n\s*\n')
+# MULTIPLE_SPACES = re.compile(r'\s+')
+# def clean_text_for_llm(text: Optional[str]) -> str:
+#     """
+#     Efficiently clean and normalize text for LLM processing.
+#     """
+#     # Early returns
+#     if not text:
+#         return ""
+#     if not isinstance(text, str):
+#         try:
+#             text = str(text)
+#         except Exception:
+#             return ""
+#     # Single-pass character filtering
+#     chars = []
+#     prev_char = ''
+#     space_pending = False
+    # for char in text:
+    #     # Skip null bytes and most control characters
+    #     if char == '\0' or unicodedata.category(char).startswith('C'):
+    #         if char not in '\n\t':
+    #             continue
+    #     # Convert escaped sequences
+    #     if prev_char == '\\':
+    #         if char == 'n':
+    #             chars[-1] = '\n'
+    #             continue
+    #         if char == 't':
+    #             chars[-1] = '\t'
+    #             continue
+    #     # Handle whitespace
+    #     if char.isspace():
+    #         if not space_pending:
+    #             space_pending = True
+    #         continue
+    #     if space_pending:
+    #         chars.append(' ')
+    #         space_pending = False
+    #     chars.append(char)
+    #     prev_char = char
+    # # Join characters and perform remaining operations
+    # text = ''.join(chars)
+    # # Remove HTML tags
+    # #text = HTML_TAG_PATTERN.sub('', text)
+    # # Normalize Unicode in a single pass
+    # text = unicodedata.normalize('NFKC', text)
+    # # Clean up newlines
+    # text = MULTIPLE_NEWLINES.sub('\n', text)
+    # Final trim
+    # return text.strip()