Upload 5 files
- main.py +69 -0
- models.py +7 -0
- prompts.py +27 -0
- requirements.txt +21 -0
- utils.py +148 -0
main.py
ADDED
@@ -0,0 +1,69 @@
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
from PIL import Image
import io
import json
from utils import get_text
from json_flatten import flatten

app = FastAPI(
    title="DOCUMANTICAI API",
    description="""
    This API allows you to upload an image and get a formatted response with the extracted data and image information.
    """
)

@app.post("/upload")
async def upload_image(fields: str, model: str, file: UploadFile = File(...)):
    """
    ### Endpoint Description:
    Extract form data from an uploaded image and return the extracted data in JSON format.

    #### Request Parameters:
    - `fields`: The fields to extract, or "ALL" for everything. (Required)
    - `model`: The model to use for extraction; see `/list_models`. (Required)
    - `file`: The image file to extract data from. (Required)

    #### Response:
    - A JSON object with the extracted key-value pairs and the image details.

    ### Notes:
    - The image should be in a supported format (e.g., PNG, JPEG).
    - The data extracted will vary depending on the image content.
    """
    try:
        # Load the uploaded image
        image = Image.open(io.BytesIO(await file.read()))

        # Collect basic image metadata
        image_details = {
            "filename": file.filename,
            "format": image.format,
            "size": image.size,  # (width, height)
            "mode": image.mode
        }

        # Extract the form data with the selected model
        response = get_text(image, image_details['filename'], model, fields)

        # Convert the escaped JSON string into a proper dictionary
        response = json.loads(response)

        # Convert the parallel fields/values lists into key-value pairs
        if 'fields' in response and 'values' in response:
            response = dict(zip(response['fields'], response['values']))

        # Flatten any nested structure into a single-level dictionary
        response = flatten(response)

        return JSONResponse(content={"response": response, "details": image_details})

    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=400)

@app.post("/list_models")
async def list_models():
    """
    ### Endpoint Description:
    List available models for text generation.

    #### Response:
    - A list of available models for text generation.
    """
    return JSONResponse(content={"models": ["gpt-4o-mini", "gpt-4o", "deepseek-chat", "claude-3-5-sonnet-20241022", "llama_llm_d", "llama_llm_o"]})
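For reference, a minimal client-side sketch of calling the /upload endpoint. It assumes the app is served locally (e.g., uvicorn main:app --port 8000) and uses the requests library, which is not in requirements.txt; "form.png" is a hypothetical file. Since fields and model have no defaults in the signature, FastAPI treats them as query parameters.

import requests

# Hypothetical example: extract all fields from a local form image.
with open("form.png", "rb") as f:
    r = requests.post(
        "http://localhost:8000/upload",
        params={"fields": "ALL", "model": "gpt-4o-mini"},  # query parameters
        files={"file": ("form.png", f, "image/png")},      # multipart body
    )
print(r.json())  # {"response": {...}, "details": {...}}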
models.py
ADDED
@@ -0,0 +1,7 @@
from pydantic import BaseModel, Field
from typing import Optional, List


# Structured output schema: parallel lists of extracted field names and values
class FormDetails(BaseModel):
    fields: List[str]
    values: List[str]
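To make the schema concrete: the model returns two parallel lists, which /upload later zips into key-value pairs. A small illustrative sketch (the sample data is made up):

from models import FormDetails

# Hypothetical extraction result: fields and values are parallel lists.
details = FormDetails(fields=["Name", "Date"], values=["Jane Doe", "2024-01-15"])
print(dict(zip(details.fields, details.values)))
# {'Name': 'Jane Doe', 'Date': '2024-01-15'}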
prompts.py
ADDED
@@ -0,0 +1,27 @@
# Define the system prompt template
system_prompt_template = """
You are an OCR-like data extraction tool that extracts information from the provided image of a form for {}.
1. Please extract the data in this image, and then output it as JSON.
2. Please keep the keys and values of the JSON in the original language.
3. The type of data you might encounter in the image includes but is not limited to: names, dates, checkboxes, etc.
4. If there are tables in the image, capture all of the rows and columns in the JSON object.
Even if a column is blank, include it as a key in the JSON object with a null value.
5. Don't interpolate or make up data.
6. Please maintain the table structure of the charges, i.e., capture all of the rows and columns in the JSON object.
7. Return null if the data is not available.
8. If no checkboxes are selected, just return null.
9. Triple-check any numbers provided in the attached image.
10. Properly check which row the data belongs to.

EXAMPLE JSON OUTPUT:
{}
"""

prompt = """Please extract the [{}] details from this image, and then output them as JSON."""

# Fill in the two placeholders dynamically using the format function
# (the template takes both a mission/filename and an example JSON schema):
# mission = "processing medical records"  # Replace with your specific mission
# system_prompt = system_prompt_template.format(mission, FormDetails.schema_json())

# Print or use the resulting system_prompt
# print(system_prompt)
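As a sketch of how these templates are filled in practice (mirroring the calls in utils.get_text): the first placeholder receives the uploaded file's name, the second an example JSON schema, and prompt receives the requested fields. The filename and field list below are hypothetical.

from models import FormDetails
from prompts import system_prompt_template, prompt

system_prompt = system_prompt_template.format(
    "invoice.png",              # hypothetical filename
    FormDetails.schema_json(),  # example JSON output schema
)
user_prompt = prompt.format("Name, Date, Total")  # hypothetical field list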
requirements.txt
ADDED
@@ -0,0 +1,21 @@
anthropic==0.44.0
fastapi==0.115.6
httpcore==1.0.7
httpx==0.28.1
ipykernel==6.29.5
ipython==8.31.0
jupyter_client==8.6.3
jupyter_core==5.7.2
matplotlib-inline==0.1.7
nest-asyncio==1.6.0
openai==1.59.9
pillow==11.1.0
pydantic==2.10.5
python-dotenv==1.0.1
python-multipart==0.0.20
tqdm==4.67.1
uvicorn==0.34.0
llama-index-core==0.12.14
llama-index-readers-file==0.4.4
llama-parse==0.5.20
json-flatten==0.3.0
utils.py
ADDED
@@ -0,0 +1,148 @@
from dotenv import load_dotenv
import os
from openai import OpenAI
from models import FormDetails
from prompts import system_prompt_template, prompt
import base64
from io import BytesIO
import anthropic
import nest_asyncio
from llama_parse import LlamaParse

# Allow nested event loops (LlamaParse runs asyncio internally)
nest_asyncio.apply()
load_dotenv()

# Set up the LlamaParse parser
parser = LlamaParse(
    result_type="markdown"  # "markdown" and "text" are available
)

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')


# Encode a PIL image as a base64 string
def encode_image(image):
    buffer = BytesIO()
    # Save the image to the buffer in its original format (e.g., PNG, JPEG)
    image.save(buffer, format=image.format)
    # Get the byte data and encode it
    image_bytes = buffer.getvalue()
    return base64.b64encode(image_bytes).decode("utf-8")


def get_text(image, filename, model, fields="ALL"):
    # Get the base64 string for the image
    base64_image = encode_image(image)

    # OpenAI models: names starting with "gpt"
    if model.startswith("gpt"):
        client = OpenAI(api_key=OPENAI_API_KEY)
        response = client.beta.chat.completions.parse(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": system_prompt_template.format(filename, FormDetails.schema_json())
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt.format(fields),
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                        },
                    ],
                }
            ],
            response_format=FormDetails,
            temperature=0.0,
        )
        response = response.choices[0].message.content

    # Anthropic models: names starting with "claude"
    elif model.startswith("claude"):
        client = anthropic.Anthropic()
        message = client.messages.create(
            model=model,
            max_tokens=1024,
            system=system_prompt_template.format(filename, FormDetails.schema_json()) + " In the following JSON format: class FormDetails(BaseModel):\nfields: List[str]\nvalues: List[str]",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",  # assumes a PNG upload
                                "data": base64_image,
                            },
                        },
                        {
                            "type": "text",
                            "text": prompt.format(fields),
                        }
                    ],
                }
            ],
            temperature=0.0,
        )
        response = message.content[0].text

    # LlamaParse-backed models: parse the image to text first
    elif model.startswith("llama_llm"):
        # Ensure the image is in RGB mode (to handle RGBA images)
        if image.mode == "RGBA":
            image = image.convert("RGB")
        # Save the image to a file and parse it with LlamaParse
        image.save("image.jpg")
        text = parser.load_data("image.jpg")
        if model == "llama_llm_o":
            # Feed the parsed text to gpt-4o-mini as a knowledge base
            client = OpenAI(api_key=OPENAI_API_KEY)
            response = client.beta.chat.completions.parse(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "system",
                        "content": system_prompt_template.format(filename, FormDetails.schema_json())
                    },
                    {
                        "role": "user",
                        "content": f"{prompt.format(fields)} \n Knowledge Base {text}"
                    }
                ],
                response_format=FormDetails,
                temperature=0.0,
            )
            response = response.choices[0].message.content
        elif model == "llama_llm_d":
            # Feed the parsed text to DeepSeek, forcing JSON output
            client = OpenAI(api_key=os.getenv('DEEPSEEK_API_KEY'), base_url=os.getenv('DEEPSEEK_API_URL'))
            response = client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {
                        "role": "system",
                        "content": system_prompt_template.format(filename, FormDetails.schema_json())
                    },
                    {
                        "role": "user",
                        "content": f"{prompt.format(fields)} \n Knowledge Base {text}"
                    }
                ],
                stream=False,
                response_format={
                    'type': 'json_object'
                }
            )
            response = response.choices[0].message.content
        else:
            # Guard against an unknown llama_llm variant leaving response unbound
            raise ValueError(f"Unknown model: {model}")
    else:
        raise ValueError(f"Unknown model: {model}")

    return response
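Finally, a sketch of calling get_text directly, outside the API. It assumes OPENAI_API_KEY is set (via .env, loaded by load_dotenv); the Anthropic and LlamaParse clients similarly read ANTHROPIC_API_KEY and LLAMA_CLOUD_API_KEY from the environment, and the llama_llm_d path needs DEEPSEEK_API_KEY and DEEPSEEK_API_URL. "form.png" is a hypothetical file.

import json
from PIL import Image
from utils import get_text

# Hypothetical example: extract two fields from a local form image.
image = Image.open("form.png")
raw = get_text(image, "form.png", model="gpt-4o-mini", fields="Name, Date")
data = json.loads(raw)  # {"fields": [...], "values": [...]}
print(dict(zip(data["fields"], data["values"])))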