| # import base64 | |
| # import os | |
| # import requests | |
| # API_URL = "https://j103s9d2e4dccfoc.aistudio-app.com/layout-parsing" | |
| # TOKEN = "<redacted>"  # secret removed from source; the live code below reads TOKEN from the environment | |
| # # file_path = r"C:\Users\shiva\OneDrive\Desktop\ip.pdf" | |
| # # ---------- READ PDF ---------- | |
| # def VL_model(file_path: str): | |
| # with open(file_path, "rb") as file: | |
| # file_bytes = file.read() | |
| # file_data = base64.b64encode(file_bytes).decode("ascii") | |
| # headers = { | |
| # "Authorization": f"token {TOKEN}", | |
| # "Content-Type": "application/json" | |
| # } | |
| # payload = { | |
| # "file": file_data, | |
| # "fileType": 0, | |
| # "useDocOrientationClassify": False, | |
| # "useDocUnwarping": False, | |
| # "useChartRecognition": False, | |
| # } | |
| # response = requests.post(API_URL, json=payload, headers=headers) | |
| # assert response.status_code == 200 | |
| # result = response.json()["result"] | |
| # # ---------- OUTPUT ---------- | |
| # output_dir = "img2" | |
| # os.makedirs(output_dir, exist_ok=True) | |
| # FINAL_MD_PATH = os.path.join(output_dir, "final_document.md") | |
| # with open(FINAL_MD_PATH, "w", encoding="utf-8") as final_md: | |
| # for i, res in enumerate(result["layoutParsingResults"]): | |
| # # Page separator (VERY IMPORTANT for parsing later) | |
| # final_md.write(f"\n\n---\n## PAGE {i+1}\n---\n\n") | |
| # # Append markdown text | |
| # final_md.write(res["markdown"]["text"]) | |
| # final_md.write("\n") | |
| # # Save embedded images | |
| # for img_path, img_url in res["markdown"]["images"].items(): | |
| # full_img_path = os.path.join(output_dir, img_path) | |
| # os.makedirs(os.path.dirname(full_img_path), exist_ok=True) | |
| # img_bytes = requests.get(img_url).content | |
| # with open(full_img_path, "wb") as img_file: | |
| # img_file.write(img_bytes) | |
| # # Save detected layout images (figures, tables) | |
| # for img_name, img_url in res["outputImages"].items(): | |
| # img_response = requests.get(img_url) | |
| # if img_response.status_code == 200: | |
| # filename = os.path.join(output_dir, f"{img_name}_page{i+1}.jpg") | |
| # with open(filename, "wb") as f: | |
| # f.write(img_response.content) | |
| # print(f"β Single merged markdown saved at: {FINAL_MD_PATH}") | |
| # return FINAL_MD_PATH | |
# Standard library
import base64
import os

# Third-party
import requests
from dotenv import load_dotenv

# Endpoint of the hosted layout-parsing service that VL_model posts to.
API_URL = "https://j103s9d2e4dccfoc.aistudio-app.com/layout-parsing"

# Load variables from a .env file (if one is found) before reading the API
# token from the environment; TOKEN is None when the variable is not set.
load_dotenv()
TOKEN = os.getenv("TOKEN")
# (connect, read) timeouts in seconds. Parsing a whole PDF can be slow, so the
# read timeout for the parsing request is generous; image fetches are small.
_PARSE_TIMEOUT = (10, 600)
_IMAGE_TIMEOUT = (10, 60)


def _save_image(img_url: str, dest_path: str) -> bool:
    """Download *img_url* to *dest_path* unless that file already exists.

    Returns True when the file is present afterwards (already on disk or
    freshly written) and False when the server answered with an error status,
    in which case nothing is written.
    """
    if os.path.exists(dest_path):
        return True

    response = requests.get(img_url, timeout=_IMAGE_TIMEOUT)
    if not response.ok:
        # Never persist an HTTP error body under an image file name: the
        # exists-check above would then treat the broken file as cached.
        print(f"Skipping image {img_url}: HTTP {response.status_code}")
        return False

    with open(dest_path, "wb") as img_file:
        img_file.write(response.content)
    return True


def VL_model(file_path: str, query_id: int) -> str:
    """Send a document to the layout-parsing service and save it as markdown.

    The file is base64-encoded and posted to ``API_URL``. Each entry of the
    response's ``layoutParsingResults`` is appended to a single markdown file
    under a ``## PAGE n`` header. Images referenced by the markdown and the
    service's layout visualisations are downloaded into a shared ``imgs``
    directory.

    Output layout::

        vl_output_bro/
            imgs/                 images for all queries
            query_<query_id>/
                final_document.md

    Args:
        file_path: Path of the document to parse (sent with ``fileType`` 0).
        query_id: Identifier used to name the per-query output directory.

    Returns:
        Path of the merged markdown file.

    Raises:
        OSError: If ``file_path`` cannot be read or output cannot be written.
        requests.HTTPError: If the parsing request returns an error status.
        requests.RequestException: On connection failures or timeouts.
        KeyError: If the response lacks the expected keys.
    """
    # ---------- READ PDF ----------
    with open(file_path, "rb") as file:
        file_data = base64.b64encode(file.read()).decode("ascii")

    headers = {
        "Authorization": f"token {TOKEN}",
        "Content-Type": "application/json",
    }
    payload = {
        "file": file_data,
        # NOTE(review): 0 is assumed to mean "PDF" for this service — confirm
        # against the API documentation.
        "fileType": 0,
        "useDocOrientationClassify": False,
        "useDocUnwarping": False,
        "useChartRecognition": False,
    }

    # Without a timeout a stalled connection would block the caller forever.
    response = requests.post(
        API_URL, json=payload, headers=headers, timeout=_PARSE_TIMEOUT
    )
    response.raise_for_status()
    result = response.json()["result"]

    # ---------- OUTPUT STRUCTURE ----------
    base_dir = "vl_output_bro"
    img_dir = os.path.join(base_dir, "imgs")
    query_dir = os.path.join(base_dir, f"query_{query_id}")
    os.makedirs(img_dir, exist_ok=True)
    os.makedirs(query_dir, exist_ok=True)

    final_md_path = os.path.join(query_dir, "final_document.md")

    with open(final_md_path, "w", encoding="utf-8") as final_md:
        pages = result["layoutParsingResults"]
        for page_no, res in enumerate(pages, start=1):
            # ---------- PAGE HEADER ----------
            final_md.write(f"\n\n---\n## PAGE {page_no}\n---\n\n")
            final_md.write(res["markdown"]["text"])
            final_md.write("\n")

            # ---------- EMBEDDED MARKDOWN IMAGES ----------
            for img_path, img_url in res["markdown"]["images"].items():
                # Keep the file name the service used so references inside
                # the markdown text still match the saved file.
                # NOTE(review): img_dir is shared by every query, so an
                # identically named image from another document would be
                # reused instead of downloaded — confirm names are unique.
                img_name = os.path.basename(img_path)
                img_file_path = os.path.join(img_dir, img_name)

                if _save_image(img_url, img_file_path):
                    # Link the local copy relative to query_<id>/, i.e. the
                    # directory the markdown file lives in. Forward slashes
                    # are used on purpose: this is a markdown URL, not an
                    # OS path.
                    final_md.write(f"\n![{img_name}](../imgs/{img_name})\n")

            # ---------- DETECTED LAYOUT IMAGES ----------
            for img_name, img_url in res["outputImages"].items():
                # The same output-image keys can appear on every page and in
                # every query; qualify the file name so later pages are not
                # skipped as "already downloaded".
                layout_name = f"query_{query_id}_{img_name}_page{page_no}.jpg"
                _save_image(img_url, os.path.join(img_dir, layout_name))

    print(f"β VL markdown saved at: {final_md_path}")
    print(f"β Images saved in: {img_dir}")

    return final_md_path
| # VL_model(file_path=r"C:\\Users\\shiva\\OneDrive\\Desktop\\mini 2.0\\queries\\1c835fef22ef433288d92f00f24f21aa_daa.pdf", query_id=1) | |