Spaces:
Sleeping
Sleeping
Commit
·
30857ba
1
Parent(s):
23ca6b0
Added source code
Browse files- Dockerfile +5 -20
- src/app.py +370 -0
- src/config.py +2 -0
- src/predict_output.py +559 -0
- src/requirements.txt +175 -0
Dockerfile
CHANGED
|
@@ -1,21 +1,6 @@
|
|
| 1 |
-
FROM python:3.
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
RUN apt-get update && apt-get install -y \
|
| 6 |
-
build-essential \
|
| 7 |
-
curl \
|
| 8 |
-
software-properties-common \
|
| 9 |
-
git \
|
| 10 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
-
|
| 12 |
-
COPY requirements.txt ./
|
| 13 |
-
COPY src/ ./src/
|
| 14 |
-
|
| 15 |
-
RUN pip3 install -r requirements.txt
|
| 16 |
-
|
| 17 |
EXPOSE 8501
|
| 18 |
-
|
| 19 |
-
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
| 20 |
-
|
| 21 |
-
ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
|
|
|
| 1 |
+
FROM python:3.10
|
| 2 |
+
WORKDIR /src
|
| 3 |
+
COPY . /src
|
| 4 |
+
RUN pip install -r requirements.txt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
EXPOSE 8501
|
| 6 |
+
# Launch the Streamlit app added by this commit: with WORKDIR /src and
# `COPY . /src`, the repository's src/app.py is reachable as src/app.py.
CMD ["streamlit", "run", "src/app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
|
|
|
|
|
|
|
|
src/app.py
ADDED
|
@@ -0,0 +1,370 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import time
|
| 3 |
+
import os
|
| 4 |
+
import shutil
|
| 5 |
+
import pymupdf
|
| 6 |
+
import json
|
| 7 |
+
|
| 8 |
+
st.set_page_config(
|
| 9 |
+
page_title="MGVG Grounding Demo",
|
| 10 |
+
layout="wide",
|
| 11 |
+
initial_sidebar_state="expanded",
|
| 12 |
+
page_icon="logo.png"
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
# --- Simple Authentication ---
|
| 16 |
+
import streamlit as st
|
| 17 |
+
import time
|
| 18 |
+
|
| 19 |
+
# Define your valid credentials
|
| 20 |
+
VALID_USERS = {
|
| 21 |
+
"iitb": "iitb123",
|
| 22 |
+
"badri": "badri123"
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
def login():
    """Render the login screen and record a successful sign-in.

    Injects the page background CSS, shows the logo and a username/password
    form in the middle of three columns, and checks the entered credentials
    against ``VALID_USERS``.  On success ``st.session_state["authenticated"]``
    is set to ``True`` and a "Continue to App" button is offered; pressing it
    clears ``show_continue`` and reruns the script.
    """
    # Local import so the block carries its own dependency.
    import hmac

    # Set a professional background for the whole app
    st.markdown(
        '''
<style>
body, .stApp {
background: linear-gradient(120deg, #e0eafc 0%, #cfdef3 100%) !important;
}
.login-box {
background: #fff;
padding: 2.5em 2em 2em 2em;
border-radius: 16px;
box-shadow: 0 4px 24px rgba(80, 120, 200, 0.12);
min-width: 320px;
max-width: 90vw;
margin: auto;
}
</style>
''', unsafe_allow_html=True
    )

    # Center the login box using columns
    _, center_col, _ = st.columns([1, 2, 1])
    with center_col:
        # image at center
        st.image("logo.png", width=800, use_container_width=False)
        st.markdown('<h2 style="text-align:center; color:#2b6cb0; margin-bottom:1.5em;">🔒 Please log in to access the app</h2>', unsafe_allow_html=True)
        username = st.text_input("Username", key="login_username")
        password = st.text_input("Password", type="password", key="login_password")

        if st.button("Login"):
            expected = VALID_USERS.get(username)
            # Compare as bytes: compare_digest rejects non-ASCII str input,
            # and a constant-time comparison avoids leaking match length.
            credentials_ok = expected is not None and hmac.compare_digest(
                password.encode("utf-8"), expected.encode("utf-8")
            )
            if credentials_ok:
                st.session_state["authenticated"] = True
                st.success("Login successful!")
                st.session_state["show_continue"] = True
            else:
                st.error("Invalid username or password")

        if st.session_state.get("show_continue", False):
            if st.button("Continue to App"):
                st.session_state["show_continue"] = False
                # Prefer the current API and fall back to the legacy name so
                # the rerun happens on both old and new Streamlit releases.
                rerun = getattr(st, "rerun", None) or getattr(st, "experimental_rerun", None)
                if rerun is not None:
                    rerun()
        # NOTE: the former trailing '</div>' was dropped; its opening tag is
        # not emitted anywhere, so it only produced unbalanced markup.
|
| 67 |
+
|
| 68 |
+
if "authenticated" not in st.session_state:
|
| 69 |
+
st.session_state["authenticated"] = False
|
| 70 |
+
|
| 71 |
+
if not st.session_state["authenticated"]:
|
| 72 |
+
login()
|
| 73 |
+
st.stop()
|
| 74 |
+
# --- End Authentication ---
|
| 75 |
+
|
| 76 |
+
# st.image("logo.png", width=250)
|
| 77 |
+
|
| 78 |
+
from PIL import Image, ImageDraw
|
| 79 |
+
import io
|
| 80 |
+
# from st_audiorec import st_audiorec
|
| 81 |
+
|
| 82 |
+
from surya.layout import LayoutPredictor
|
| 83 |
+
from doctr.models import ocr_predictor
|
| 84 |
+
from transformers import pipeline
|
| 85 |
+
|
| 86 |
+
@st.cache_resource
def get_layout_predictor():
    """Create the ``LayoutPredictor`` instance (wrapped by ``st.cache_resource``)."""
    predictor = LayoutPredictor()
    return predictor
|
| 89 |
+
|
| 90 |
+
@st.cache_resource
def get_ocr_model():
    """Create the pretrained OCR predictor (wrapped by ``st.cache_resource``).

    Uses the ``db_resnet50`` detection and ``crnn_vgg16_bn`` recognition
    architectures.
    """
    ocr_model = ocr_predictor(
        det_arch='db_resnet50',
        reco_arch='crnn_vgg16_bn',
        pretrained=True,
    )
    return ocr_model
|
| 93 |
+
|
| 94 |
+
@st.cache_resource
def get_llm_model(device):
    """Create the text-generation pipeline on *device* (wrapped by ``st.cache_resource``)."""
    llm_pipeline = pipeline(
        "text-generation",
        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
        device=device,
    )
    return llm_pipeline
|
| 97 |
+
|
| 98 |
+
from predict_output import predict_output
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
layout_predictor = get_layout_predictor()
|
| 102 |
+
model = get_ocr_model()
|
| 103 |
+
pipe = get_llm_model("cuda")
|
| 104 |
+
|
| 105 |
+
print("Models loaded")
|
| 106 |
+
|
| 107 |
+
# --- Placeholder function for demo ---
|
| 108 |
+
def get_corresponding_bboxes(image, question):
    """Return fixed demo boxes and a canned answer derived from the image size.

    *question* is ignored.  Every box is an ``(x1, y1, x2, y2)`` tuple computed
    from integer fractions of ``image.size``.

    Returns:
        ``(block_bboxes, line_bboxes, word_bboxes, point_bboxes, answer)``.
    """
    width, height = image.size
    half_w, half_h = width // 2, height // 2

    blocks = [(width // 8, height // 8, half_w, half_h)]
    lines = [(width // 4, height // 4, half_w, height // 3)]
    words = [(width // 3, height // 3, half_w, half_h)]
    points = [(half_w, half_h, half_w + 5, half_h + 5)]

    return blocks, lines, words, points, "This is a demo answer."
|
| 118 |
+
|
| 119 |
+
# --- Helper to draw bboxes ---
|
| 120 |
+
def draw_bboxes(image, bboxes, color):
    """Return a copy of *image* with every box in *bboxes* outlined in *color*.

    The input image is not modified.  Each bbox is passed straight to
    ``ImageDraw.rectangle``.
    """
    img = image.copy()
    # Outline thickness is proportional to the image width.  Clamp to 1 px:
    # for images narrower than 100 px the proportional value is 0, and a
    # zero-width outline draws nothing.
    width = max(1, img.width // 100)
    draw = ImageDraw.Draw(img)
    for bbox in bboxes:
        draw.rectangle(bbox, outline=color, width=width)
    return img
|
| 128 |
+
|
| 129 |
+
def draw_points(image, bboxes, color):
    """Return a copy of *image* with a filled dot drawn for each entry of *bboxes*.

    Only the first two values of each entry are used, as the dot centre
    ``(cx, cy)``.  The input image is not modified.
    """
    img = image.copy()
    # Dot radius is proportional to the image width, computed once and
    # clamped to 1 px so dots stay visible on images narrower than 100 px.
    radius = max(1, img.width // 100)
    draw = ImageDraw.Draw(img)
    for bbox in bboxes:
        cx, cy = bbox[0], bbox[1]
        # Fill and outline share one colour, so no outline width is needed.
        draw.ellipse(
            (cx - radius, cy - radius, cx + radius, cy + radius),
            outline=color,
            fill=color,
        )
    return img
|
| 140 |
+
|
| 141 |
+
# model_type = st.sidebar.checkbox("Use LLM Model", value=False)
|
| 142 |
+
# model_type = "llm" if model_type else "inhouse"
|
| 143 |
+
|
| 144 |
+
st.markdown("""
|
| 145 |
+
<style>
|
| 146 |
+
.main {
|
| 147 |
+
background: linear-gradient(135deg, #f8fafc 0%, #e0e7ef 100%);
|
| 148 |
+
}
|
| 149 |
+
.block-container {
|
| 150 |
+
padding-top: 2rem;
|
| 151 |
+
padding-bottom: 2rem;
|
| 152 |
+
}
|
| 153 |
+
.stButton>button {
|
| 154 |
+
background-color: #4F8BF9;
|
| 155 |
+
color: white;
|
| 156 |
+
border-radius: 8px;
|
| 157 |
+
font-size: 1.1rem;
|
| 158 |
+
padding: 0.5em 2em;
|
| 159 |
+
}
|
| 160 |
+
.stTextInput>div>input {
|
| 161 |
+
border-radius: 8px;
|
| 162 |
+
border: 1px solid #4F8BF9;
|
| 163 |
+
}
|
| 164 |
+
.stFileUploader>div>div {
|
| 165 |
+
border-radius: 8px;
|
| 166 |
+
border: 2px dashed #4F8BF9;
|
| 167 |
+
}
|
| 168 |
+
.stAudio>audio {
|
| 169 |
+
width: 100% !important;
|
| 170 |
+
}
|
| 171 |
+
</style>
|
| 172 |
+
""", unsafe_allow_html=True)
|
| 173 |
+
|
| 174 |
+
col_logo, col_title = st.columns([1, 8])
|
| 175 |
+
with col_logo:
|
| 176 |
+
st.image("logo.png", width=180)
|
| 177 |
+
with col_title:
|
| 178 |
+
st.markdown("<h1 style='margin-bottom: 0;'>MGVG - Multi-Granular Visual Grounding</h1>", unsafe_allow_html=True)
|
| 179 |
+
|
| 180 |
+
# List of quotes (HTML formatted)
|
| 181 |
+
QUOTES = [
|
| 182 |
+
'''<div style="color: #2b6cb0; font-size: 1.3em; font-weight: 500; margin-bottom: 1em;">
|
| 183 |
+
"प्रत्यक्षं किं प्रमाणं?" <span style="font-size:0.9em; color:#444;">(<i>What better proof is there than direct perception?)</i></span>
|
| 184 |
+
</div>''',
|
| 185 |
+
'''<div style="color: #2b6cb0; font-size: 1.3em; font-weight: 500; margin-bottom: 1em;">
|
| 186 |
+
<i>"Truth is not told—it is seen."</i>
|
| 187 |
+
</div>'''
|
| 188 |
+
]
|
| 189 |
+
|
| 190 |
+
# Initialize session state for quote index and last update time
|
| 191 |
+
if "quote_index" not in st.session_state:
|
| 192 |
+
st.session_state.quote_index = 0
|
| 193 |
+
st.session_state.last_quote_time = time.time()
|
| 194 |
+
|
| 195 |
+
# Check if 5 seconds have passed
|
| 196 |
+
if time.time() - st.session_state.last_quote_time > 5:
|
| 197 |
+
st.session_state.quote_index = (st.session_state.quote_index + 1) % len(QUOTES)
|
| 198 |
+
st.session_state.last_quote_time = time.time()
|
| 199 |
+
# Rerun the app to update the quote
|
| 200 |
+
if hasattr(st, "experimental_rerun"):
|
| 201 |
+
st.experimental_rerun()
|
| 202 |
+
|
| 203 |
+
# Display the current quote
|
| 204 |
+
st.markdown(QUOTES[st.session_state.quote_index], unsafe_allow_html=True)
|
| 205 |
+
|
| 206 |
+
col1, col2 = st.columns([1, 2])
|
| 207 |
+
|
| 208 |
+
with col1:
|
| 209 |
+
st.subheader("1. Upload Image or pdf document")
|
| 210 |
+
image = "Not Uploaded"
|
| 211 |
+
uploaded_file = st.file_uploader("Choose an image", type=["png", "jpg", "jpeg", "pdf"])
|
| 212 |
+
if uploaded_file:
|
| 213 |
+
current_dir = os.getcwd()
|
| 214 |
+
temp_output_folder = os.path.join(current_dir, "temp_output_folder/")
|
| 215 |
+
# delete the temp_output_folder
|
| 216 |
+
if os.path.exists(temp_output_folder):
|
| 217 |
+
shutil.rmtree(temp_output_folder)
|
| 218 |
+
|
| 219 |
+
document_type = "image"
|
| 220 |
+
if uploaded_file.type == "application/pdf":
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
# save the uploaded file to a temp file
|
| 224 |
+
temp_file_path = os.path.join(current_dir, "temp_file.pdf")
|
| 225 |
+
|
| 226 |
+
# delete the temp_file_path
|
| 227 |
+
if os.path.exists(temp_file_path):
|
| 228 |
+
os.remove(temp_file_path)
|
| 229 |
+
|
| 230 |
+
with open(temp_file_path, "wb") as f:
|
| 231 |
+
f.write(uploaded_file.getbuffer())
|
| 232 |
+
|
| 233 |
+
if not os.path.exists(temp_output_folder):
|
| 234 |
+
os.makedirs(temp_output_folder)
|
| 235 |
+
# output_file = simple_counter_generator("page", ".jpg")
|
| 236 |
+
# convert_from_path(document_path, output_folder=temp_output_folder, dpi=300, fmt='jpeg', jpegopt= jpg_options, output_file=output_file)
|
| 237 |
+
|
| 238 |
+
pages = 0
|
| 239 |
+
doc = pymupdf.open(temp_file_path) # open document
|
| 240 |
+
for page in doc: # iterate through the pages
|
| 241 |
+
pages += 1
|
| 242 |
+
pix = page.get_pixmap() # render page to an image
|
| 243 |
+
pix.save(f"{temp_output_folder}/{page.number}.png")
|
| 244 |
+
|
| 245 |
+
if(pages == 1):
|
| 246 |
+
document_type = "image"
|
| 247 |
+
document_path = os.path.join(temp_output_folder, "0.png")
|
| 248 |
+
uploaded_file = os.path.join(temp_output_folder, "0.png")
|
| 249 |
+
image = Image.open(uploaded_file).convert("RGB")
|
| 250 |
+
else:
|
| 251 |
+
document_type = "pdf"
|
| 252 |
+
# image = Image.open(uploaded_file).convert("RGB")
|
| 253 |
+
|
| 254 |
+
if document_type == "image":
|
| 255 |
+
image = Image.open(uploaded_file).convert("RGB")
|
| 256 |
+
st.image(image, caption="Uploaded Image", use_container_width=True)
|
| 257 |
+
# Save uploaded image to a temp file for predict_output
|
| 258 |
+
temp_file_path = "sample.png"
|
| 259 |
+
image.save(temp_file_path)
|
| 260 |
+
else:
|
| 261 |
+
document_type = "pdf"
|
| 262 |
+
document_path = uploaded_file.name
|
| 263 |
+
image = "Uploaded PDF"
|
| 264 |
+
# st.image(uploaded_file, caption="Uploaded PDF", use_container_width=True)
|
| 265 |
+
else:
|
| 266 |
+
image = "Not Uploaded"
|
| 267 |
+
temp_output_folder = None
|
| 268 |
+
st.image("https://placehold.co/400x300?text=Upload+Image", caption="Uploaded Image", use_container_width=True)
|
| 269 |
+
|
| 270 |
+
st.subheader("2. Ask a question")
|
| 271 |
+
question = st.text_input("Type your question here")
|
| 272 |
+
|
| 273 |
+
# Add radio button for model selection
|
| 274 |
+
model_type = st.radio(
|
| 275 |
+
"Select Model Type:",
|
| 276 |
+
options=["MGVG", "IndoDocs"],
|
| 277 |
+
index=1,
|
| 278 |
+
horizontal=True
|
| 279 |
+
)
|
| 280 |
+
|
| 281 |
+
run_demo = st.button("Run Grounding Demo", use_container_width=True)
|
| 282 |
+
|
| 283 |
+
# --- Output placeholders ---
|
| 284 |
+
with col2:
|
| 285 |
+
st.subheader("3. Visual Grounding Outputs")
|
| 286 |
+
if image!="Not Uploaded" and (question):
|
| 287 |
+
print(image)
|
| 288 |
+
print(question)
|
| 289 |
+
if run_demo and image!="Not Uploaded" and (question):
|
| 290 |
+
# Use text input only
|
| 291 |
+
q = question
|
| 292 |
+
answer, block_bboxes, line_bboxes, word_bboxes, point_bboxes, current_page = predict_output(
|
| 293 |
+
temp_file_path, q, pipe, layout_predictor, model, model_type, document_type
|
| 294 |
+
)
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
# print(block_bboxes)
|
| 298 |
+
# print(line_bboxes)
|
| 299 |
+
# print(word_bboxes)
|
| 300 |
+
# print(point_bboxes)
|
| 301 |
+
print(answer)
|
| 302 |
+
|
| 303 |
+
if(current_page != -1):
|
| 304 |
+
image = Image.open(os.path.join(temp_output_folder, f"{current_page}.png")).convert("RGB")
|
| 305 |
+
print("--------------------------------")
|
| 306 |
+
print(image)
|
| 307 |
+
|
| 308 |
+
block_img = draw_bboxes(image, block_bboxes, color="#4F8BF9")
|
| 309 |
+
line_img = draw_bboxes(image, line_bboxes, color="#F97B4F")
|
| 310 |
+
word_img = draw_bboxes(image, word_bboxes, color="#4FF9B2")
|
| 311 |
+
point_img = draw_points(image, point_bboxes, color="#FFFF00")
|
| 312 |
+
imgs = [block_img, line_img, word_img, point_img]
|
| 313 |
+
labels = ["Block Level", "Line Level", "Word Level", "Point Level"]
|
| 314 |
+
cols = st.columns(4)
|
| 315 |
+
for i, (img, label) in enumerate(zip(imgs, labels)):
|
| 316 |
+
with cols[i]:
|
| 317 |
+
st.image(img, caption=label, use_container_width=True)
|
| 318 |
+
answer_lines = answer.splitlines()
|
| 319 |
+
st.markdown("""
|
| 320 |
+
<div style='background: #f1f5fa; border-radius: 10px; padding: 1em 2em; border: 1.5px solid #4F8BF9;'>
|
| 321 |
+
<h4 style='color: #4F8BF9;'>Predicted Answer:</h4>
|
| 322 |
+
<p style='font-size: 1.2em; color: #222;'>""" + "<br>".join(answer_lines) + """</p>
|
| 323 |
+
</div>
|
| 324 |
+
""", unsafe_allow_html=True)
|
| 325 |
+
|
| 326 |
+
# --- Centered Save Results Button ---
|
| 327 |
+
result_data = {
|
| 328 |
+
"question": q,
|
| 329 |
+
"answer": answer,
|
| 330 |
+
"block_bboxes": block_bboxes,
|
| 331 |
+
"line_bboxes": line_bboxes,
|
| 332 |
+
"word_bboxes": word_bboxes,
|
| 333 |
+
"point_bboxes": point_bboxes,
|
| 334 |
+
"current_page": current_page
|
| 335 |
+
}
|
| 336 |
+
json_str = json.dumps(result_data, indent=2)
|
| 337 |
+
col_left, col_center, col_right = st.columns([2, 3, 2])
|
| 338 |
+
with col_center:
|
| 339 |
+
st.download_button(
|
| 340 |
+
label="Save Results as JSON",
|
| 341 |
+
data=json_str,
|
| 342 |
+
file_name="grounding_results.json",
|
| 343 |
+
mime="application/json"
|
| 344 |
+
)
|
| 345 |
+
else:
|
| 346 |
+
st.markdown("""
|
| 347 |
+
<div style='display: flex; gap: 2em; flex-wrap: wrap;'>
|
| 348 |
+
<div style='flex: 1; min-width: 220px;'>
|
| 349 |
+
<img src='https://placehold.co/220x180?text=Block+Level' style='width:100%; border-radius: 10px; border: 2px solid #4F8BF9;'>
|
| 350 |
+
<p style='text-align:center; font-weight:600;'>Block Level</p>
|
| 351 |
+
</div>
|
| 352 |
+
<div style='flex: 1; min-width: 220px;'>
|
| 353 |
+
<img src='https://placehold.co/220x180?text=Line+Level' style='width:100%; border-radius: 10px; border: 2px solid #4F8BF9;'>
|
| 354 |
+
<p style='text-align:center; font-weight:600;'>Line Level</p>
|
| 355 |
+
</div>
|
| 356 |
+
<div style='flex: 1; min-width: 220px;'>
|
| 357 |
+
<img src='https://placehold.co/220x180?text=Word+Level' style='width:100%; border-radius: 10px; border: 2px solid #4F8BF9;'>
|
| 358 |
+
<p style='text-align:center; font-weight:600;'>Word Level</p>
|
| 359 |
+
</div>
|
| 360 |
+
<div style='flex: 1; min-width: 220px;'>
|
| 361 |
+
<img src='https://placehold.co/220x180?text=Point+Level' style='width:100%; border-radius: 10px; border: 2px solid #4F8BF9;'>
|
| 362 |
+
<p style='text-align:center; font-weight:600;'>Point Level</p>
|
| 363 |
+
</div>
|
| 364 |
+
</div>
|
| 365 |
+
<br>
|
| 366 |
+
<div style='background: #f1f5fa; border-radius: 10px; padding: 1em 2em; border: 1.5px solid #4F8BF9;'>
|
| 367 |
+
<h4 style='color: #4F8BF9;'>Predicted Answer:</h4>
|
| 368 |
+
<p style='font-size: 1.2em; color: #222;'>[Answer will appear here]</p>
|
| 369 |
+
</div>
|
| 370 |
+
""", unsafe_allow_html=True)
|
src/config.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MODEL1 = "MGVG"
|
| 2 |
+
MODEL2 = "IndoDocs"
|
src/predict_output.py
ADDED
|
@@ -0,0 +1,559 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import shutil
|
| 3 |
+
from fuzzywuzzy import fuzz
|
| 4 |
+
from tqdm import tqdm
|
| 5 |
+
from PIL import Image
|
| 6 |
+
import requests
|
| 7 |
+
# from surya.layout import LayoutPredictor
|
| 8 |
+
|
| 9 |
+
from doctr.io import DocumentFile
|
| 10 |
+
from pdf2image import convert_from_path
|
| 11 |
+
import pymupdf
|
| 12 |
+
# from doctr.models import ocr_predictor
|
| 13 |
+
import numpy as np
|
| 14 |
+
from time import time
|
| 15 |
+
|
| 16 |
+
pipe = None
|
| 17 |
+
layout_predictor = None
|
| 18 |
+
|
| 19 |
+
MAX_BLOCK_MATCHES = 2
|
| 20 |
+
MAX_LINE_MATCHES = 5
|
| 21 |
+
CUT_OFF_THRESHOLD = 60
|
| 22 |
+
QUESTION_WEIGHT = 0.2
|
| 23 |
+
ANSWER_WEIGHT = 0.8
|
| 24 |
+
LEVEL = "line"
|
| 25 |
+
|
| 26 |
+
jpg_options = {
|
| 27 |
+
"quality" : 100,
|
| 28 |
+
"progressive": True,
|
| 29 |
+
"optimize" : False
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
stop_words = {'what', 'is', 'the', 'this', 'that', 'these', 'those', 'which', 'how', 'why', 'where', 'when', 'who', 'will', 'be', 'and', 'or', 'in', 'at', 'to', 'for', 'of', 'with', 'by'}
|
| 33 |
+
|
| 34 |
+
def longest_consecutive_range(indices):
    """Return the longest run of consecutive integers found in *indices*.

    Duplicates are ignored and order does not matter.  When several runs share
    the maximum length, the one with the smallest values is returned.  An empty
    input yields ``[]``.
    """
    if not indices:
        return []

    ordered = sorted(set(indices))
    best = []
    run = [ordered[0]]

    for previous, value in zip(ordered, ordered[1:]):
        if value == previous + 1:
            run.append(value)
            continue
        # The run just ended; keep it only if it is strictly longer.
        if len(run) > len(best):
            best = run
        run = [value]

    return run if len(run) > len(best) else best
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def get_word_level_matches(answer_text, top_k_matches):
    """Collect bboxes of the words in each match that appear in the answer.

    For every match, the indices of words whose lower-cased text is a substring
    of the lower-cased *answer_text* are gathered, and only the longest run of
    consecutive indices contributes its word bboxes.

    Args:
        answer_text: The answer string to look the words up in.
        top_k_matches: Iterable of dicts with a ``'words'`` list whose items
            carry ``'text'`` and ``'bbox'``.

    Returns:
        A flat list of word bboxes, in match order.
    """
    if not top_k_matches:
        return []

    # Lower-case the answer once instead of once per word.
    answer_lower = answer_text.lower()
    bboxes = []
    for match in top_k_matches:
        words = match['words']
        indices = [
            index
            for index, word in enumerate(words)
            # Blank text is a substring of every string, so it would always
            # "match"; skip it explicitly.
            if word['text'].strip() and word['text'].lower() in answer_lower
        ]
        bboxes.extend(words[index]['bbox'] for index in longest_consecutive_range(indices))
    return bboxes
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def get_matched_regions(question_text, target_text, predictions, level):
    """Score OCR regions against the answer and question; return the best ones.

    A region whose text contains *target_text* (case-insensitively) is an exact
    match with score 100.  Otherwise a fuzzy answer score and a fuzzy question
    score are blended with ``ANSWER_WEIGHT`` / ``QUESTION_WEIGHT`` and the
    region is kept when the result reaches ``CUT_OFF_THRESHOLD``.  If nothing
    reaches the threshold, the single best region scoring at least
    ``CUT_OFF_THRESHOLD - 20`` is returned instead.

    Args:
        question_text: The user's question; stop words are ignored.
        target_text: The predicted answer to locate.
        predictions: Iterable of region dicts with at least a ``'text'`` key.
        level: ``"block"`` or ``"line"``; selects the result cap
            (``MAX_BLOCK_MATCHES`` / ``MAX_LINE_MATCHES``).

    Returns:
        Copies of the matching regions, best first, each extended with
        ``'match_score'`` and ``'match_details'``.

    Raises:
        ValueError: If *level* is neither ``"block"`` nor ``"line"``.
    """
    if level == "block":
        max_matches = MAX_BLOCK_MATCHES
    elif level == "line":
        max_matches = MAX_LINE_MATCHES
    else:
        # Previously this surfaced as an UnboundLocalError at the return.
        raise ValueError(f"Unsupported level {level!r}; expected 'block' or 'line'")

    if not predictions:
        return []

    # Loop invariants: lower-cased answer, its length, and the question terms.
    target_lower = target_text.lower()
    target_len = len(target_text)
    question_terms = [word.lower() for word in question_text.split() if word.lower() not in stop_words]

    matched_regions = []
    # Scored regions that missed the main threshold; used by the fallback.
    below_threshold = []

    for region in predictions:
        region_text = region['text']
        region_lower = region_text.lower()
        region_copy = region.copy()

        if target_lower in region_lower:
            region_copy['match_score'] = 100
            region_copy['match_details'] = {
                'exact_match': True,
                'answer_score': 100,
                'question_score': 100
            }
            matched_regions.append(region_copy)
            continue

        partial_score = fuzz.partial_ratio(target_lower, region_lower)
        token_score = fuzz.token_set_ratio(target_lower, region_lower)

        # Length factor (preference for longer matches that contain meaningful
        # content).  target_len cannot be 0 here: an empty target is a
        # substring of every region and was handled as an exact match above.
        region_len = len(region_text)
        length_factor = min(1.0, region_len / min(50, target_len))  # Cap at 1.0, adapt based on target length

        # Higher weight to token matching for longer texts, higher weight to
        # partial matching for shorter texts.
        if region_len > 10:
            answer_score = (partial_score * 0.3) + (token_score * 0.5) + (length_factor * 100 * 0.2)
        else:
            # For very short texts, reduce their overall score unless they're exact matches
            answer_score = (partial_score * 0.3) + (token_score * 0.4) + (length_factor * 100 * 0.3)
            if region_len < 5 and partial_score < 100:
                answer_score *= 0.5  # Penalize very short inexact matches

        # Penalize shorter region texts (applied in addition to the penalty
        # above, exactly as before).
        if region_len < 5:
            answer_score *= 0.5

        # Best fuzzy scores of any question term against the region.
        best_partial_question = max(
            (fuzz.partial_ratio(term, region_lower) for term in question_terms), default=0
        )
        best_token_question = max(
            (fuzz.token_set_ratio(term, region_lower) for term in question_terms), default=0
        )
        question_score = (best_partial_question * 0.4) + (best_token_question * 0.6)

        # Combine scores (giving more weight to answer matches)
        combined_score = (answer_score * ANSWER_WEIGHT) + (question_score * QUESTION_WEIGHT)

        region_copy['match_score'] = combined_score
        region_copy['match_details'] = {
            'exact_match': False,
            'answer_score': answer_score,
            'question_score': question_score,
            'answer_weight': ANSWER_WEIGHT,
            'question_weight': QUESTION_WEIGHT
        }
        if combined_score >= CUT_OFF_THRESHOLD:
            matched_regions.append(region_copy)
        else:
            below_threshold.append(region_copy)

    matched_regions.sort(key=lambda x: x['match_score'], reverse=True)

    # If no matches, reduce threshold by 20 and keep the topmost single output.
    # The candidates come from below_threshold; the old code re-filtered the
    # (empty) matched list, so this fallback could never return anything.
    if not matched_regions:
        new_threshold = max(CUT_OFF_THRESHOLD - 20, 0)  # Prevent negative threshold
        candidates = [region for region in below_threshold if region['match_score'] >= new_threshold]
        if candidates:
            matched_regions = [max(candidates, key=lambda x: x['match_score'])]

    return matched_regions[:max_matches]
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def get_processed_text_for_llm(block_predictions, gap):
    """Concatenate the ``'text'`` of every block, appending *gap* after each one.

    The separator also follows the final block; an empty input gives ``""``.
    """
    pieces = (entry['text'] + gap for entry in block_predictions)
    return "".join(pieces)
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def get_page_number(block_bboxes):
    """Return the page that contains the most entries of *block_bboxes*.

    Each entry must provide a ``'page'`` key.  Ties go to the page seen first.
    The per-page counts are printed for debugging.

    Returns:
        The winning page value, or ``0`` (the first page) when *block_bboxes*
        is empty, so callers get a usable page instead of a ``ValueError``
        from ``max()`` on an empty mapping.
    """
    pages = {}
    for block in block_bboxes:
        page = block['page']
        pages[page] = pages.get(page, 0) + 1

    print(pages)
    if not pages:
        return 0
    return max(pages, key=pages.get)
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
def predict_output(document_path, question, pipe, layout_predictor, model, model_type, document_type="image"):
    """Answer *question* about a document and ground the answer at four levels.

    Steps: run line-level OCR, optionally run block-level layout prediction
    (only for PDFs with fewer than 3 pages; otherwise the line predictions
    double as blocks), obtain an answer from the LLM or the in-house model,
    then match the answer back to blocks, lines, words and points.

    Args:
        document_path: Path handed to the prediction helpers.
        question: The user's question.
        pipe: Text-generation pipeline passed to ``generate_llm_answer``.
        layout_predictor: Layout model passed to ``get_block_predictions``.
        model: OCR model passed to the prediction helpers.
        model_type: ``"MGVG"`` (LLM) or ``"IndoDocs"`` (in-house model).  PDFs
            always use the LLM path.
        document_type: ``"image"`` or ``"pdf"``.

    Returns:
        ``(predicted_answer, block_boxes, line_boxes, word_boxes, point_boxes,
        current_page)``.  ``current_page`` is ``-1`` for images; for PDFs the
        answer is prefixed with the 1-based page it was found on and all boxes
        are restricted to that page.

    Raises:
        ValueError: If *model_type* is not recognised for a non-PDF document.
    """
    curr_time = time()
    line_predictions, pages_count = get_line_predictions(document_path, model, document_type)
    line_time = time()
    print(f"Done with line predictions in {line_time - curr_time} seconds")

    if document_type == "pdf" and pages_count < 3:
        block_predictions = get_block_predictions(document_path, layout_predictor, model, document_type)
        gap = '\n\n\n'
    else:
        block_predictions = line_predictions
        gap = '\n'
    block_time = time()
    print(f"Done with block predictions in {block_time - line_time} seconds")

    curr_time = time()
    if model_type == "MGVG" or document_type == "pdf":
        processed_text_for_llm = get_processed_text_for_llm(block_predictions, gap=gap)
        predicted_answer = generate_llm_answer(question, processed_text_for_llm, pipe)
    elif model_type == "IndoDocs":
        predicted_answer = generate_via_inhouse_model_answer(question, document_path)
    else:
        # Previously the answer silently stayed None and failed later on.
        raise ValueError(f"Unsupported model_type {model_type!r}; expected 'MGVG' or 'IndoDocs'")
    llm_time = time()
    print(f"Done with LLM in {llm_time - curr_time} seconds")

    print("LLM Answer: ", predicted_answer)

    total_algo_time = time()

    curr_time = time()
    line_matches = get_matched_regions(question, predicted_answer, line_predictions, "line")
    block_bboxes = get_matched_regions(question, predicted_answer, block_predictions, "block")
    match_time = time()
    print(f"Done with match in {match_time - curr_time} seconds")

    if document_type == "pdf":
        # With no block match there is nothing to vote on; fall back to the
        # first page instead of letting the page lookup fail.
        current_page = get_page_number(block_bboxes) if block_bboxes else 0
    else:
        current_page = -1

    def _on_current_page(match):
        # Images carry no page restriction; PDFs only keep the chosen page so
        # boxes from other pages are not drawn onto it.
        return current_page == -1 or match['page'] == current_page

    block_box_predictions = [match['bbox'] for match in block_bboxes if _on_current_page(match)]
    page_line_matches = [match for match in line_matches if _on_current_page(match)]
    line_box_predictions = [match['bbox'] for match in page_line_matches]

    curr_time = time()
    # Word matching uses the raw answer: the page prefix is added afterwards
    # so its words ("answer", "page", ...) cannot produce spurious matches.
    word_box_predictions = get_word_level_matches(predicted_answer, top_k_matches=page_line_matches)
    word_time = time()
    print(f"Done with word in {word_time - curr_time} seconds")

    curr_time = time()
    point_box_predictions = get_point_level_matches(block_box_predictions, line_box_predictions, word_box_predictions)
    point_time = time()
    print(f"Done with point in {point_time - curr_time} seconds")

    print(f"Total algo time: {time() - total_algo_time} seconds")

    if current_page != -1:
        predicted_answer = "Answer predicted from page: " + str(current_page+1) + "\n" + predicted_answer

    return predicted_answer, block_box_predictions, line_box_predictions, word_box_predictions, point_box_predictions, current_page
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
def calculate_midpoint_of_bboxes(bboxes):
    """Return the centre of the rectangle that encloses every box in ``bboxes``.

    Args:
        bboxes: A sequence (or NumPy array) of ``[x1, y1, x2, y2]`` boxes.
            A single flat ``[x1, y1, x2, y2]`` box is accepted as well and is
            treated as a one-element list of boxes.

    Returns:
        A ``(mid_x, mid_y)`` tuple of floats rounded to two decimals, or
        ``None`` when ``bboxes`` is ``None`` or contains no boxes.
    """
    if bboxes is None:
        return None

    # ``np.asarray`` + ``size`` instead of ``not bboxes``: truth-testing a
    # multi-element NumPy array raises ValueError.
    boxes = np.asarray(bboxes, dtype=float)
    if boxes.size == 0:
        return None

    # Promote a flat [x1, y1, x2, y2] box to shape (1, 4) so the column
    # indexing below works for a single box too.
    boxes = np.atleast_2d(boxes)

    # Extreme corners of the union of all boxes.
    min_x = boxes[:, 0].min()
    min_y = boxes[:, 1].min()
    max_x = boxes[:, 2].max()
    max_y = boxes[:, 3].max()

    midpoint_x = float((min_x + max_x) / 2)
    midpoint_y = float((min_y + max_y) / 2)

    return round(midpoint_x, 2), round(midpoint_y, 2)
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
def get_point_level_matches(block_box_predictions, line_box_predictions, word_box_predictions):
    """Reduce the matched boxes to representative ``[x, y]`` points.

    * Exactly one block box: return the midpoint of that block; if that
      cannot be computed, fall back to the midpoint of the union of the
      line boxes; if that fails too, return ``[]``.
    * Otherwise (zero or several block boxes): return one midpoint per
      block box, skipping boxes whose midpoint cannot be computed.

    Args:
        block_box_predictions: List of ``[x1, y1, x2, y2]`` block boxes.
        line_box_predictions: List of ``[x1, y1, x2, y2]`` line boxes.
        word_box_predictions: Unused; kept for interface compatibility.

    Returns:
        A list of ``[x, y]`` points (possibly empty).
    """
    if len(block_box_predictions) == 1:
        # Try the block first, then the lines; the first candidate that
        # yields a midpoint wins.
        for candidate_boxes in (block_box_predictions, line_box_predictions):
            try:
                x, y = calculate_midpoint_of_bboxes(candidate_boxes)
            except Exception:
                # Best effort: an empty candidate makes the helper return
                # None (unpacking fails) and malformed boxes make it raise.
                continue
            return [[x, y]]
        return []

    points = []
    for block_bbox in block_box_predictions:
        try:
            # The helper expects a *list* of boxes. Passing the flat box
            # directly made it raise for every block, so this branch used
            # to return an empty list unconditionally.
            x, y = calculate_midpoint_of_bboxes([block_bbox])
        except Exception:
            continue
        points.append([x, y])

    return points
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
def generate_via_inhouse_model_answer(
    question,
    image_path,
    api_key="VISION-TEAM",
    max_tokens=512,
    temperature=0.7,
    endpoint="http://103.207.148.38:9000/api/v1/chat/upload",
    timeout=None,
):
    """Send a question and an image to the in-house chat endpoint.

    Args:
        question: Text sent as the ``text`` form field.
        image_path: Path of the image uploaded as the ``image`` file field.
        api_key: Value sent in the ``x-api-key`` header.
        max_tokens: Sent (stringified) as the ``max_tokens`` form field.
        temperature: Sent (stringified) as the ``temperature`` form field.
        endpoint: URL the multipart POST is sent to.
        timeout: Optional ``requests`` timeout in seconds. ``None`` (the
            default) waits indefinitely, which is the previous behaviour.

    Returns:
        The string at ``response.choices[0].message.content`` of the JSON
        reply, or ``{"error": <message>}`` when the request fails or the
        reply does not have that shape.

    Raises:
        OSError: If ``image_path`` cannot be opened.
    """
    headers = {
        "x-api-key": api_key  # or whatever the Swagger UI says
    }

    data = {
        "text": question,
        "max_tokens": str(max_tokens),
        "temperature": str(temperature),
    }

    # The context manager closes the upload handle on success and on
    # failure; it was previously left open.
    with open(image_path, "rb") as image_file:
        try:
            response = requests.post(
                endpoint,
                headers=headers,
                files={"image": image_file},
                data=data,
                timeout=timeout,
            )
            response.raise_for_status()
            result = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers JSON decoding failures on requests versions
            # where that error is not a RequestException subclass.
            return {"error": str(e)}

    try:
        return result['response']['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError) as e:
        # Report an unexpected reply shape the same way as transport errors
        # instead of raising out of the function.
        return {"error": f"Unexpected response format: {e!r}"}
|
| 346 |
+
|
| 347 |
+
def generate_llm_answer(question, context, pipe):
    """Ask the chat pipeline for an extractive answer to ``question``.

    A single user message is built from the instruction template, the
    question and the context, and passed to ``pipe`` with sampling enabled
    (``temperature=0.7``, up to 512 new tokens).

    Args:
        question: The question to answer.
        context: Text the answer must be taken from.
        pipe: Callable invoked as ``pipe(messages, **generation_kwargs)``.

    Returns:
        The ``content`` of the second message in the first result's
        ``generated_text`` list.
    """
    instruction = f"""You are given a question and context. Your task is to find and return the best possible answer to the question using only the context as it is.
Do not generate summaries, paraphrased content, or any additional explanations including any preamble and postamble.
Return only the exact phrase or sentence fragment from the context that answers the question.
If the answer is not found in the context, return: Answer not found in context.

Question: {question}
Context: {context}
Answer:
"""

    chat = [{"role": "user", "content": instruction}]
    generations = pipe(chat, max_new_tokens=512, do_sample=True, temperature=0.7)

    # Index 0 of generated_text is the user turn sent above; index 1 is
    # the reply.
    reply = generations[0]["generated_text"][1]
    return reply['content']
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
def _scale_geometry_to_bbox(geometry, dim):
    """Scale a two-point geometry by ``dim`` into ``[x1, y1, x2, y2]``, rounded to 2 decimals."""
    # float(round(...)) works for both Python and NumPy scalars, whereas
    # ``.astype(float)`` exists only on NumPy scalars.
    return [
        float(round(coord * size, 2))
        for point in (geometry[0], geometry[1])
        for coord, size in zip(point, dim)
    ]


def get_line_predictions(document_path, model, document_type):
    """Run the OCR ``model`` on a document and collect line-level results.

    For ``document_type == "pdf"`` every page is rendered to a PNG inside
    ``<cwd>/temp_output_folder/`` (the folder is wiped first) and OCR'd in
    page order. Any other document type is treated as a single image at
    ``<cwd>/<document_path>``.

    Args:
        document_path: Path of the PDF or image.
        model: Callable applied to ``DocumentFile.from_images(...)``; its
            result must expose ``pages -> blocks -> lines -> words``.
        document_type: ``"pdf"`` for PDFs; anything else means an image.

    Returns:
        ``(line_predictions, pages_count)``. ``line_predictions`` is a list
        of dicts with keys ``bbox`` (``[x1, y1, x2, y2]``), ``text``,
        ``words`` (list of ``{"bbox", "text"}`` dicts) and ``page``
        (0-based index of the source image). ``pages_count`` is the index
        of the last processed image, i.e. number of images minus one, or
        ``-1`` when nothing was processed.
    """
    current_dir = os.getcwd()

    if document_type == "pdf":
        temp_output_folder = os.path.join(current_dir, "temp_output_folder/")

        # Start from an empty folder so pages of a previous document are
        # never picked up.
        if os.path.exists(temp_output_folder):
            shutil.rmtree(temp_output_folder)
        os.makedirs(temp_output_folder, exist_ok=True)

        # Record the paths while rendering. Listing the folder and sorting
        # the names put "10.png" before "2.png", and a single-page PDF was
        # later opened by bare file name relative to the cwd.
        images_path = []
        with pymupdf.open(document_path) as doc:
            for page in doc:
                page_image = os.path.join(temp_output_folder, f"{page.number}.png")
                page.get_pixmap().save(page_image)
                images_path.append(page_image)
    else:
        images_path = [os.path.join(current_dir, document_path)]

    line_predictions = []

    pages_count = -1  # stays -1 when there is nothing to process
    for pages_count, image_path in enumerate(images_path):
        result = model(DocumentFile.from_images(image_path))

        for page in result.pages:
            # Reversed so the sizes line up with the (x, y) order of the
            # geometry points; presumably page.dimensions is
            # (height, width) - confirm against docTR.
            dim = tuple(reversed(page.dimensions))
            for block in page.blocks:
                for line in block.lines:
                    words_data = [
                        {
                            'bbox': _scale_geometry_to_bbox(word.geometry, dim),
                            'text': word.value,
                        }
                        for word in line.words
                    ]
                    line_predictions.append({
                        'bbox': _scale_geometry_to_bbox(line.geometry, dim),
                        'text': " ".join(word.value for word in line.words),
                        'words': words_data,
                        'page': pages_count,
                    })

    return line_predictions, pages_count
|
| 452 |
+
|
| 453 |
+
|
| 454 |
+
def get_block_predictions(document_path, layout_predictor, model, document_type):
    """Detect layout blocks in a document and OCR the text of each block.

    For ``document_type == "pdf"`` every page is rendered to a PNG inside
    ``<cwd>/temp_output_folder/`` (the folder is wiped first) and processed
    in page order. Any other document type is treated as a single image at
    ``<cwd>/<document_path>``.

    Args:
        document_path: Path of the PDF or image.
        layout_predictor: Callable taking a list of PIL images; element 0
            of its result must expose ``bboxes``, each with a ``bbox``.
        model: Callable applied to ``DocumentFile.from_images(...)``; its
            result must expose ``pages -> blocks -> lines -> words``.
        document_type: ``"pdf"`` for PDFs; anything else means an image.

    Returns:
        A list of dicts with keys ``bbox`` (integer ``[x1, y1, x2, y2]``),
        ``text`` (space-joined OCR words of the crop) and ``page`` (0-based
        index of the source image).
    """
    current_dir = os.getcwd()

    if document_type == "pdf":
        temp_output_folder = os.path.join(current_dir, "temp_output_folder/")

        # Start from an empty folder so pages of a previous document are
        # never picked up.
        if os.path.exists(temp_output_folder):
            shutil.rmtree(temp_output_folder)
        os.makedirs(temp_output_folder, exist_ok=True)

        # Record the paths while rendering. Listing the folder and sorting
        # the names put "10.png" before "2.png", and for a single-page PDF
        # the PDF itself was handed to PIL instead of the rendered page.
        images_path = []
        with pymupdf.open(document_path) as doc:
            for page in doc:
                page_image = os.path.join(temp_output_folder, f"{page.number}.png")
                page.get_pixmap().save(page_image)
                images_path.append(page_image)
    else:
        images_path = [os.path.join(current_dir, document_path)]

    block_predictions = []

    for page_count, image_path in enumerate(images_path):
        # Context manager so the page image's file handle is released.
        with Image.open(image_path) as image:
            layout_predictions = layout_predictor([image])

            for layout_block in layout_predictions[0].bboxes:
                bbox = [int(x) for x in layout_block.bbox]

                # The crop goes through a scratch file in the cwd because
                # the OCR model is fed from a path.
                image.crop(bbox).save('temp.png')
                result = model(DocumentFile.from_images('temp.png'))

                text = []
                for page in result.pages:
                    for ocr_block in page.blocks:
                        for line in ocr_block.lines:
                            for word in line.words:
                                text.append(word.value)

                block_predictions.append({
                    'bbox': bbox,
                    'text': " ".join(text),
                    'page': page_count,
                })

    return block_predictions
|
| 524 |
+
|
| 525 |
+
def simple_counter_generator(prefix="", suffix=""):
    """Yield the constant string ``'p'`` forever.

    Args:
        prefix: Accepted but not used.
        suffix: Accepted but not used.

    Yields:
        ``'p'`` on every iteration; the generator never terminates.
    """
    constant_name = 'p'
    while True:
        yield constant_name
|
| 528 |
+
|
| 529 |
+
|
| 530 |
+
|
| 531 |
+
# from doctr.models import ocr_predictor
|
| 532 |
+
# model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
|
| 533 |
+
|
| 534 |
+
|
| 535 |
+
# # from transformers import pipeline
|
| 536 |
+
# # def load_llm_model(device):
|
| 537 |
+
# # pipe = pipeline("text-generation", model="meta-llama/Meta-Llama-3.1-8B-Instruct", device=device)
|
| 538 |
+
# # return pipe
|
| 539 |
+
|
| 540 |
+
# # pipe = load_llm_model("cuda")
|
| 541 |
+
# pipe = None
|
| 542 |
+
|
| 543 |
+
# # from surya.layout import LayoutPredictor
|
| 544 |
+
# # layout_predictor = LayoutPredictor()
|
| 545 |
+
# layout_predictor = None
|
| 546 |
+
|
| 547 |
+
# document_path = "sample.pdf"
|
| 548 |
+
# question = "What is the subject of the circular?"
|
| 549 |
+
|
| 550 |
+
# answer, block_box_predictions, line_box_predictions, word_box_predictions, point_box_predictions = predict_output(document_path, question, pipe, layout_predictor, model, "Inhouse", document_type="pdf")
|
| 551 |
+
|
| 552 |
+
# print(answer)
|
| 553 |
+
# print(block_box_predictions)
|
| 554 |
+
# print(line_box_predictions)
|
| 555 |
+
# print(word_box_predictions)
|
| 556 |
+
# print(point_box_predictions)
|
| 557 |
+
|
| 558 |
+
|
| 559 |
+
|
src/requirements.txt
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
accelerate==1.6.0
|
| 2 |
+
aiohappyeyeballs==2.6.1
|
| 3 |
+
aiohttp==3.11.18
|
| 4 |
+
aiosignal==1.3.2
|
| 5 |
+
albucore==0.0.23
|
| 6 |
+
albumentations==2.0.5
|
| 7 |
+
altair==5.5.0
|
| 8 |
+
annotated-types==0.7.0
|
| 9 |
+
anthropic==0.46.0
|
| 10 |
+
anyascii==0.3.2
|
| 11 |
+
anyio==4.9.0
|
| 12 |
+
asttokens==3.0.0
|
| 13 |
+
async-timeout==5.0.1
|
| 14 |
+
attrs==25.3.0
|
| 15 |
+
av==14.3.0
|
| 16 |
+
beautifulsoup4==4.13.4
|
| 17 |
+
blinker==1.9.0
|
| 18 |
+
cachetools==5.5.2
|
| 19 |
+
certifi==2025.1.31
|
| 20 |
+
cfgv==3.4.0
|
| 21 |
+
charset-normalizer==3.4.1
|
| 22 |
+
click==8.1.8
|
| 23 |
+
comm==0.2.2
|
| 24 |
+
contourpy==1.3.1
|
| 25 |
+
cycler==0.12.1
|
| 26 |
+
datasets==3.5.0
|
| 27 |
+
debugpy==1.8.14
|
| 28 |
+
decorator==5.2.1
|
| 29 |
+
defusedxml==0.7.1
|
| 30 |
+
dill==0.3.8
|
| 31 |
+
distlib==0.3.9
|
| 32 |
+
distro==1.9.0
|
| 33 |
+
doclayout_yolo==0.0.3
|
| 34 |
+
exceptiongroup==1.2.2
|
| 35 |
+
executing==2.2.0
|
| 36 |
+
filelock==3.18.0
|
| 37 |
+
filetype==1.2.0
|
| 38 |
+
fonttools==4.57.0
|
| 39 |
+
frozenlist==1.6.0
|
| 40 |
+
fsspec==2024.12.0
|
| 41 |
+
ftfy==6.3.1
|
| 42 |
+
fuzzywuzzy==0.18.0
|
| 43 |
+
gitdb==4.0.12
|
| 44 |
+
GitPython==3.1.44
|
| 45 |
+
google-auth==2.39.0
|
| 46 |
+
google-genai==1.11.0
|
| 47 |
+
h11==0.14.0
|
| 48 |
+
h5py==3.13.0
|
| 49 |
+
httpcore==1.0.8
|
| 50 |
+
httpx==0.28.1
|
| 51 |
+
huggingface-hub==0.30.2
|
| 52 |
+
identify==2.6.10
|
| 53 |
+
idna==3.10
|
| 54 |
+
ipykernel==6.29.5
|
| 55 |
+
ipython==8.35.0
|
| 56 |
+
jedi==0.19.2
|
| 57 |
+
Jinja2==3.1.6
|
| 58 |
+
jiter==0.9.0
|
| 59 |
+
joblib==1.4.2
|
| 60 |
+
jsonschema==4.23.0
|
| 61 |
+
jsonschema-specifications==2025.4.1
|
| 62 |
+
jupyter_client==8.6.3
|
| 63 |
+
jupyter_core==5.7.2
|
| 64 |
+
kiwisolver==1.4.8
|
| 65 |
+
langdetect==1.0.9
|
| 66 |
+
markdown2==2.5.3
|
| 67 |
+
markdownify==0.13.1
|
| 68 |
+
marker-pdf==1.6.2
|
| 69 |
+
MarkupSafe==3.0.2
|
| 70 |
+
matplotlib==3.10.1
|
| 71 |
+
matplotlib-inline==0.1.7
|
| 72 |
+
mpmath==1.3.0
|
| 73 |
+
multidict==6.4.3
|
| 74 |
+
multiprocess==0.70.16
|
| 75 |
+
narwhals==1.39.1
|
| 76 |
+
nest-asyncio==1.6.0
|
| 77 |
+
networkx==3.4.2
|
| 78 |
+
nodeenv==1.9.1
|
| 79 |
+
numpy==2.2.4
|
| 80 |
+
nvidia-cublas-cu12==12.4.5.8
|
| 81 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
| 82 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
| 83 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
| 84 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 85 |
+
nvidia-cufft-cu12==11.2.1.3
|
| 86 |
+
nvidia-curand-cu12==10.3.5.147
|
| 87 |
+
nvidia-cusolver-cu12==11.6.1.9
|
| 88 |
+
nvidia-cusparse-cu12==12.3.1.170
|
| 89 |
+
nvidia-cusparselt-cu12==0.6.2
|
| 90 |
+
nvidia-nccl-cu12==2.21.5
|
| 91 |
+
nvidia-nvjitlink-cu12==12.4.127
|
| 92 |
+
nvidia-nvtx-cu12==12.4.127
|
| 93 |
+
openai==1.75.0
|
| 94 |
+
opencv-python==4.11.0.86
|
| 95 |
+
opencv-python-headless==4.11.0.86
|
| 96 |
+
packaging==24.2
|
| 97 |
+
pandas==2.2.3
|
| 98 |
+
parso==0.8.4
|
| 99 |
+
pdf2image==1.17.0
|
| 100 |
+
pdftext==0.6.2
|
| 101 |
+
pexpect==4.9.0
|
| 102 |
+
pillow==10.4.0
|
| 103 |
+
platformdirs==4.3.7
|
| 104 |
+
pre_commit==4.2.0
|
| 105 |
+
prompt_toolkit==3.0.50
|
| 106 |
+
propcache==0.3.1
|
| 107 |
+
protobuf==6.31.0
|
| 108 |
+
psutil==7.0.0
|
| 109 |
+
ptyprocess==0.7.0
|
| 110 |
+
pure_eval==0.2.3
|
| 111 |
+
py-cpuinfo==9.0.0
|
| 112 |
+
pyarrow==19.0.1
|
| 113 |
+
pyasn1==0.6.1
|
| 114 |
+
pyasn1_modules==0.4.2
|
| 115 |
+
pyclipper==1.3.0.post6
|
| 116 |
+
pydantic==2.11.3
|
| 117 |
+
pydantic-settings==2.8.1
|
| 118 |
+
pydantic_core==2.33.1
|
| 119 |
+
pydeck==0.9.1
|
| 120 |
+
Pygments==2.19.1
|
| 121 |
+
PyMuPDF==1.25.5
|
| 122 |
+
pyparsing==3.2.3
|
| 123 |
+
pypdfium2==4.30.0
|
| 124 |
+
pytesseract==0.3.13
|
| 125 |
+
python-dateutil==2.9.0.post0
|
| 126 |
+
python-doctr==0.11.0
|
| 127 |
+
python-dotenv==1.1.0
|
| 128 |
+
pytz==2025.2
|
| 129 |
+
PyYAML==6.0.2
|
| 130 |
+
pyzmq==26.4.0
|
| 131 |
+
qwen-vl-utils==0.0.10
|
| 132 |
+
RapidFuzz==3.13.0
|
| 133 |
+
referencing==0.36.2
|
| 134 |
+
regex==2024.11.6
|
| 135 |
+
requests==2.32.3
|
| 136 |
+
rpds-py==0.25.0
|
| 137 |
+
rsa==4.9.1
|
| 138 |
+
safetensors==0.5.3
|
| 139 |
+
scikit-learn==1.6.1
|
| 140 |
+
scipy==1.15.2
|
| 141 |
+
seaborn==0.13.2
|
| 142 |
+
sentence-transformers==4.1.0
|
| 143 |
+
shapely==2.1.0
|
| 144 |
+
simsimd==6.2.1
|
| 145 |
+
six==1.17.0
|
| 146 |
+
smmap==5.0.2
|
| 147 |
+
sniffio==1.3.1
|
| 148 |
+
soupsieve==2.7
|
| 149 |
+
stack-data==0.6.3
|
| 150 |
+
streamlit==1.45.1
|
| 151 |
+
stringzilla==3.12.3
|
| 152 |
+
surya-ocr==0.13.1
|
| 153 |
+
sympy==1.13.1
|
| 154 |
+
tenacity==9.1.2
|
| 155 |
+
thop==0.1.1.post2209072238
|
| 156 |
+
threadpoolctl==3.6.0
|
| 157 |
+
tokenizers==0.21.1
|
| 158 |
+
toml==0.10.2
|
| 159 |
+
torch==2.6.0
|
| 160 |
+
torchvision==0.21.0
|
| 161 |
+
tornado==6.4.2
|
| 162 |
+
tqdm==4.67.1
|
| 163 |
+
traitlets==5.14.3
|
| 164 |
+
transformers==4.51.2
|
| 165 |
+
triton==3.2.0
|
| 166 |
+
typing-inspection==0.4.0
|
| 167 |
+
typing_extensions==4.13.2
|
| 168 |
+
tzdata==2025.2
|
| 169 |
+
urllib3==2.4.0
|
| 170 |
+
virtualenv==20.30.0
|
| 171 |
+
watchdog==6.0.0
|
| 172 |
+
wcwidth==0.2.13
|
| 173 |
+
websockets==15.0.1
|
| 174 |
+
xxhash==3.5.0
|
| 175 |
+
yarl==1.20.0
|