Spaces:
Sleeping
Sleeping
| from doc_scraper import extract_first_table_as_dict | |
| from screenshoter import get_screenshot | |
| from parser_llm_agent import extract_name_and_price_from_images | |
| from doc_llm_agent import compile_images_and_text_to_doc | |
| from PIL import Image | |
def process_document_pipeline(doc_path):
    """Run the full scrape -> screenshot -> extract -> compile pipeline.

    Steps:
      1. Scrape the first table of *doc_path* into a list of row dicts.
      2. Screenshot the URL in each row's "Link" field (rows without a
         link are skipped — they produce no screenshot and no output row).
      3. Ask the LLM agent for Name/Price from each screenshot.
      4. Merge extracted Name/Price with the row's original Quantity.
      5. Compile the screenshots plus merged data into a document.

    Args:
        doc_path: Path to the source document (e.g. a .docx file) that
            `extract_first_table_as_dict` can read.

    Returns:
        A 3-tuple ``(pil_images, final_data, compiled_doc)``:
        the screenshots, the merged row dicts, and the compiled document
        object produced by `compile_images_and_text_to_doc`.
    """
    # Step 1: Scrape the first table from the document.
    scraped_data = extract_first_table_as_dict(doc_path)

    # Step 2: Screenshot each linked row. `linked_rows` stays index-aligned
    # with `pil_images`; zipping against `scraped_data` directly would
    # misalign extracted results whenever a row has no "Link" (the
    # screenshot list ends up shorter and zip silently pairs wrong rows).
    pil_images = []
    linked_rows = []
    for row in scraped_data:
        link = row.get("Link")
        if link:
            pil_images.append(get_screenshot(link))
            linked_rows.append(row)

    # Step 3: Extract Name and Price from the screenshots.
    extracted_data = extract_name_and_price_from_images(pil_images)

    # Step 4: Merge each screenshotted row with its extracted Name/Price.
    final_data = []
    for original_data, extracted_info in zip(linked_rows, extracted_data):
        final_data.append({
            "Name": extracted_info.get("Name", "NONE"),
            "Price": extracted_info.get("Price", "NONE"),
            "Quantity": original_data.get("Quantity", "NONE"),
        })

    # Step 5: Compile the images and merged data into a document.
    compiled_doc = compile_images_and_text_to_doc(pil_images, final_data)
    return pil_images, final_data, compiled_doc
# Example usage
if __name__ == "__main__":
    # Path to the uploaded document to run through the pipeline.
    source_path = "sample.docx"
    screenshots, merged_rows, document = process_document_pipeline(source_path)
    for label, value in (
        ("Processed Images:", screenshots),
        ("Processed Data:", merged_rows),
        ("Compiled Document Object:", document),
    ):
        print(label, value)