Upload 5 files
- main.py +69 -0
- models.py +7 -0
- prompts.py +27 -0
- requirements.txt +21 -0
- utils.py +148 -0
main.py
ADDED
@@ -0,0 +1,69 @@
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
from PIL import Image
import io
import json
from utils import get_text
from json_flatten import flatten

app = FastAPI(
    title="DOCUMANTICAI API",
    description="""
    This API allows you to upload an image and get a formatted response with the extracted data and image information.
    """
)

@app.post("/upload")
async def upload_image(fields: str, model: str, file: UploadFile = File(...)):
    """
    ### Endpoint Description:
    Extract form data from an uploaded image and return the extracted data in JSON format.

    #### Request Parameters:
    - `fields`: The fields to extract, or "ALL" for everything. (Required)
    - `model`: The model to use for extraction; see `/list_models`. (Required)
    - `file`: The image file to extract data from. (Required)

    #### Response:
    - A JSON object with the extracted key-value pairs and the image details.

    ### Notes:
    - The image should be in a supported format (e.g., PNG, JPEG).
    - The data extracted will vary depending on the image content.
    """
    try:
        # Load the uploaded image
        image = Image.open(io.BytesIO(await file.read()))

        # Collect basic image metadata
        image_details = {
            "filename": file.filename,
            "format": image.format,
            "size": image.size,  # (width, height)
            "mode": image.mode
        }

        # Extract the form data with the selected model
        response = get_text(image, image_details['filename'], model, fields)

        # Convert the escaped JSON string into a proper dictionary
        response = json.loads(response)

        # Convert the parallel fields/values lists into key-value pairs
        if 'fields' in response and 'values' in response:
            response = dict(zip(response['fields'], response['values']))

        # Flatten any nested structure into a single-level dictionary
        response = flatten(response)

        return JSONResponse(content={"response": response, "details": image_details})

    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=400)

@app.post("/list_models")
async def list_models():
    """
    ### Endpoint Description:
    List available models for text generation.

    #### Response:
    - A list of available models for text generation.
    """
    return JSONResponse(content={"models": ["gpt-4o-mini", "gpt-4o", "deepseek-chat", "claude-3-5-sonnet-20241022", "llama_llm_d", "llama_llm_o"]})
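For reference, a minimal client-side sketch of calling the /upload endpoint. It assumes the app is served locally (e.g., uvicorn main:app --port 8000) and uses the requests library, which is not in requirements.txt; "form.png" is a hypothetical file. Since fields and model have no defaults in the signature, FastAPI treats them as query parameters.

import requests

# Hypothetical example: extract all fields from a local form image.
with open("form.png", "rb") as f:
    r = requests.post(
        "http://localhost:8000/upload",
        params={"fields": "ALL", "model": "gpt-4o-mini"},  # query parameters
        files={"file": ("form.png", f, "image/png")},      # multipart body
    )
print(r.json())  # {"response": {...}, "details": {...}}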
models.py
ADDED
@@ -0,0 +1,7 @@
from pydantic import BaseModel, Field
from typing import Optional, List


# Structured output schema: parallel lists of extracted field names and values
class FormDetails(BaseModel):
    fields: List[str]
    values: List[str]
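To make the schema concrete: the model returns two parallel lists, which /upload later zips into key-value pairs. A small illustrative sketch (the sample data is made up):

from models import FormDetails

# Hypothetical extraction result: fields and values are parallel lists.
details = FormDetails(fields=["Name", "Date"], values=["Jane Doe", "2024-01-15"])
print(dict(zip(details.fields, details.values)))
# {'Name': 'Jane Doe', 'Date': '2024-01-15'}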
prompts.py
ADDED
@@ -0,0 +1,27 @@
# Define the system prompt template
system_prompt_template = """
You are an OCR-like data extraction tool that extracts information from the provided image of a form for {}.
1. Please extract the data in this image, and then output it as JSON.
2. Please keep the keys and values of the JSON in the original language.
3. The type of data you might encounter in the image includes but is not limited to: names, dates, checkboxes, etc.
4. If there are tables in the image, capture all of the rows and columns in the JSON object.
Even if a column is blank, include it as a key in the JSON object with a null value.
5. Don't interpolate or make up data.
6. Please maintain the table structure of the charges, i.e., capture all of the rows and columns in the JSON object.
7. Return null if the data is not available.
8. If no checkboxes are selected, just return null.
9. Triple-check any numbers provided in the attached image.
10. Properly check which row the data belongs to.

EXAMPLE JSON OUTPUT:
{}
"""

prompt = """Please extract the [{}] details from this image, and then output them as JSON."""

# Fill in the two placeholders dynamically using the format function
# (the template takes both a mission/filename and an example JSON schema):
# mission = "processing medical records"  # Replace with your specific mission
# system_prompt = system_prompt_template.format(mission, FormDetails.schema_json())

# Print or use the resulting system_prompt
# print(system_prompt)
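As a sketch of how these templates are filled in practice (mirroring the calls in utils.get_text): the first placeholder receives the uploaded file's name, the second an example JSON schema, and prompt receives the requested fields. The filename and field list below are hypothetical.

from models import FormDetails
from prompts import system_prompt_template, prompt

system_prompt = system_prompt_template.format(
    "invoice.png",              # hypothetical filename
    FormDetails.schema_json(),  # example JSON output schema
)
user_prompt = prompt.format("Name, Date, Total")  # hypothetical field list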
requirements.txt
ADDED
@@ -0,0 +1,21 @@
anthropic==0.44.0
fastapi==0.115.6
httpcore==1.0.7
httpx==0.28.1
ipykernel==6.29.5
ipython==8.31.0
jupyter_client==8.6.3
jupyter_core==5.7.2
matplotlib-inline==0.1.7
nest-asyncio==1.6.0
openai==1.59.9
pillow==11.1.0
pydantic==2.10.5
python-dotenv==1.0.1
python-multipart==0.0.20
tqdm==4.67.1
uvicorn==0.34.0
llama-index-core==0.12.14
llama-index-readers-file==0.4.4
llama-parse==0.5.20
json-flatten==0.3.0
utils.py
ADDED
@@ -0,0 +1,148 @@
from dotenv import load_dotenv
import os
from openai import OpenAI
from models import FormDetails
from prompts import system_prompt_template, prompt
import base64
from io import BytesIO
import anthropic
import nest_asyncio
from llama_parse import LlamaParse

# Allow nested event loops (LlamaParse runs asyncio internally)
nest_asyncio.apply()
load_dotenv()

# Set up the LlamaParse parser
parser = LlamaParse(
    result_type="markdown"  # "markdown" and "text" are available
)

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')


# Encode a PIL image as a base64 string
def encode_image(image):
    buffer = BytesIO()
    # Save the image to the buffer in its original format (e.g., PNG, JPEG)
    image.save(buffer, format=image.format)
    # Get the byte data and encode it
    image_bytes = buffer.getvalue()
    return base64.b64encode(image_bytes).decode("utf-8")


def get_text(image, filename, model, fields="ALL"):
    # Get the base64 string for the image
    base64_image = encode_image(image)

    # OpenAI models: names starting with "gpt"
    if model.startswith("gpt"):
        client = OpenAI(api_key=OPENAI_API_KEY)
        response = client.beta.chat.completions.parse(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": system_prompt_template.format(filename, FormDetails.schema_json())
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt.format(fields),
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                        },
                    ],
                }
            ],
            response_format=FormDetails,
            temperature=0.0,
        )
        response = response.choices[0].message.content

    # Anthropic models: names starting with "claude"
    elif model.startswith("claude"):
        client = anthropic.Anthropic()
        message = client.messages.create(
            model=model,
            max_tokens=1024,
            system=system_prompt_template.format(filename, FormDetails.schema_json()) + " In the following JSON format: class FormDetails(BaseModel):\nfields: List[str]\nvalues: List[str]",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/png",  # assumes a PNG upload
                                "data": base64_image,
                            },
                        },
                        {
                            "type": "text",
                            "text": prompt.format(fields),
                        }
                    ],
                }
            ],
            temperature=0.0,
        )
        response = message.content[0].text

    # LlamaParse-backed models: parse the image to text first
    elif model.startswith("llama_llm"):
        # Ensure the image is in RGB mode (to handle RGBA images)
        if image.mode == "RGBA":
            image = image.convert("RGB")
        # Save the image to a file and parse it with LlamaParse
        image.save("image.jpg")
        text = parser.load_data("image.jpg")
        if model == "llama_llm_o":
            # Feed the parsed text to gpt-4o-mini as a knowledge base
            client = OpenAI(api_key=OPENAI_API_KEY)
            response = client.beta.chat.completions.parse(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "system",
                        "content": system_prompt_template.format(filename, FormDetails.schema_json())
                    },
                    {
                        "role": "user",
                        "content": f"{prompt.format(fields)} \n Knowledge Base {text}"
                    }
                ],
                response_format=FormDetails,
                temperature=0.0,
            )
            response = response.choices[0].message.content
        elif model == "llama_llm_d":
            # Feed the parsed text to DeepSeek, forcing JSON output
            client = OpenAI(api_key=os.getenv('DEEPSEEK_API_KEY'), base_url=os.getenv('DEEPSEEK_API_URL'))
            response = client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {
                        "role": "system",
                        "content": system_prompt_template.format(filename, FormDetails.schema_json())
                    },
                    {
                        "role": "user",
                        "content": f"{prompt.format(fields)} \n Knowledge Base {text}"
                    }
                ],
                stream=False,
                response_format={
                    'type': 'json_object'
                }
            )
            response = response.choices[0].message.content
        else:
            # Guard against an unknown llama_llm variant leaving response unbound
            raise ValueError(f"Unknown model: {model}")
    else:
        raise ValueError(f"Unknown model: {model}")

    return response
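Finally, a sketch of calling get_text directly, outside the API. It assumes OPENAI_API_KEY is set (via .env, loaded by load_dotenv); the Anthropic and LlamaParse clients similarly read ANTHROPIC_API_KEY and LLAMA_CLOUD_API_KEY from the environment, and the llama_llm_d path needs DEEPSEEK_API_KEY and DEEPSEEK_API_URL. "form.png" is a hypothetical file.

import json
from PIL import Image
from utils import get_text

# Hypothetical example: extract two fields from a local form image.
image = Image.open("form.png")
raw = get_text(image, "form.png", model="gpt-4o-mini", fields="Name, Date")
data = json.loads(raw)  # {"fields": [...], "values": [...]}
print(dict(zip(data["fields"], data["values"])))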