File size: 5,470 Bytes
cba2c8f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
# import base64
# import os
# import requests
# API_URL = "https://j103s9d2e4dccfoc.aistudio-app.com/layout-parsing"
# TOKEN = "<redacted>"  # NOTE: a real API token was committed here; rotate it and load it from the environment instead
# # file_path = r"C:\Users\shiva\OneDrive\Desktop\ip.pdf"
# # ---------- READ PDF ----------
# def VL_model(file_path: str):
# with open(file_path, "rb") as file:
# file_bytes = file.read()
# file_data = base64.b64encode(file_bytes).decode("ascii")
# headers = {
# "Authorization": f"token {TOKEN}",
# "Content-Type": "application/json"
# }
# payload = {
# "file": file_data,
# "fileType": 0,
# "useDocOrientationClassify": False,
# "useDocUnwarping": False,
# "useChartRecognition": False,
# }
# response = requests.post(API_URL, json=payload, headers=headers)
# assert response.status_code == 200
# result = response.json()["result"]
# # ---------- OUTPUT ----------
# output_dir = "img2"
# os.makedirs(output_dir, exist_ok=True)
# FINAL_MD_PATH = os.path.join(output_dir, "final_document.md")
# with open(FINAL_MD_PATH, "w", encoding="utf-8") as final_md:
# for i, res in enumerate(result["layoutParsingResults"]):
# # Page separator (VERY IMPORTANT for parsing later)
# final_md.write(f"\n\n---\n## PAGE {i+1}\n---\n\n")
# # Append markdown text
# final_md.write(res["markdown"]["text"])
# final_md.write("\n")
# # Save embedded images
# for img_path, img_url in res["markdown"]["images"].items():
# full_img_path = os.path.join(output_dir, img_path)
# os.makedirs(os.path.dirname(full_img_path), exist_ok=True)
# img_bytes = requests.get(img_url).content
# with open(full_img_path, "wb") as img_file:
# img_file.write(img_bytes)
# # Save detected layout images (figures, tables)
# for img_name, img_url in res["outputImages"].items():
# img_response = requests.get(img_url)
# if img_response.status_code == 200:
# filename = os.path.join(output_dir, f"{img_name}_page{i+1}.jpg")
# with open(filename, "wb") as f:
# f.write(img_response.content)
# print(f"✅ Single merged markdown saved at: {FINAL_MD_PATH}")
# return FINAL_MD_PATH
# Standard library
import base64
import os

# Third-party
import requests
from dotenv import load_dotenv

# Endpoint of the hosted layout-parsing service that VL_model posts documents to.
API_URL = "https://j103s9d2e4dccfoc.aistudio-app.com/layout-parsing"

# Load variables from a .env file (if one is found) into the environment, then
# read the API token from it. TOKEN is None when the variable is not set.
load_dotenv()
TOKEN = os.environ.get("TOKEN")
def _download_image(url: str, dest_path: str, timeout: float) -> None:
    """Download *url* to *dest_path* unless that file already exists."""
    if os.path.exists(dest_path):
        return
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of storing an HTTP error page under an image name;
    # such a file would otherwise be kept forever by the exists-check above.
    response.raise_for_status()
    with open(dest_path, "wb") as img_file:
        img_file.write(response.content)


def VL_model(file_path: str, query_id: int, timeout: float = 300.0) -> str:
    """Send a document to the layout-parsing API and save the result as markdown.

    The file is base64-encoded and posted to ``API_URL``. Every entry of
    ``result["layoutParsingResults"]`` is appended to a single markdown file,
    and every image the response references is downloaded into a shared
    image directory.

    Output layout (relative to the current working directory)::

        vl_output_bro/
            imgs/                      images, shared by all queries
            query_<query_id>/
                final_document.md      merged markdown, one section per page

    Args:
        file_path: Path of the document to upload.
        query_id: Identifier used to name the ``query_<query_id>`` directory.
        timeout: Value passed as ``timeout`` to every ``requests`` call (the
            upload and each image download). Pass ``None`` to wait
            indefinitely, which was the previous behaviour.

    Returns:
        Path of the written ``final_document.md``.

    Raises:
        requests.HTTPError: If the API or an image URL answers with an error
            status.
        requests.Timeout: If a request exceeds ``timeout``.
        KeyError: If the response JSON lacks one of the keys read below.
    """
    # ---------- READ PDF ----------
    with open(file_path, "rb") as file:
        file_data = base64.b64encode(file.read()).decode("ascii")

    headers = {
        "Authorization": f"token {TOKEN}",
        "Content-Type": "application/json",
    }
    payload = {
        "file": file_data,
        "fileType": 0,  # presumably selects PDF input; confirm against the API docs
        "useDocOrientationClassify": False,
        "useDocUnwarping": False,
        "useChartRecognition": False,
    }

    response = requests.post(API_URL, json=payload, headers=headers, timeout=timeout)
    response.raise_for_status()
    result = response.json()["result"]

    # ---------- OUTPUT STRUCTURE ----------
    base_dir = "vl_output_bro"
    img_dir = os.path.join(base_dir, "imgs")
    query_dir = os.path.join(base_dir, f"query_{query_id}")
    os.makedirs(img_dir, exist_ok=True)
    os.makedirs(query_dir, exist_ok=True)

    final_md_path = os.path.join(query_dir, "final_document.md")

    with open(final_md_path, "w", encoding="utf-8") as final_md:
        for page_no, res in enumerate(result["layoutParsingResults"], start=1):
            # ---------- PAGE HEADER ----------
            final_md.write(f"\n\n---\n## PAGE {page_no}\n---\n\n")
            final_md.write(res["markdown"]["text"])
            final_md.write("\n")

            # ---------- EMBEDDED MARKDOWN IMAGES ----------
            for img_path, img_url in res["markdown"]["images"].items():
                # Use exactly the same image name as the HTML in the markdown text.
                img_name = os.path.basename(img_path)
                _download_image(img_url, os.path.join(img_dir, img_name), timeout)
                # Relative path from query_X -> imgs, so the link resolves from
                # the directory that holds final_document.md.
                final_md.write(f"\n![{img_name}](../imgs/{img_name})\n")

            # ---------- DETECTED LAYOUT IMAGES ----------
            # NOTE(review): names are reused as-is and existing files are
            # skipped; if the API repeats a name across pages or queries, only
            # the first download is kept. Confirm the names are unique.
            for img_name, img_url in res["outputImages"].items():
                _download_image(
                    img_url, os.path.join(img_dir, f"{img_name}.jpg"), timeout
                )

    print(f"✅ VL markdown saved at: {final_md_path}")
    print(f"✅ Images saved in: {img_dir}")
    return final_md_path
# VL_model(file_path=r"C:\\Users\\shiva\\OneDrive\\Desktop\\mini 2.0\\queries\\1c835fef22ef433288d92f00f24f21aa_daa.pdf", query_id=1)
|