IotaCluster committed on
Commit
d3c629d
·
verified ·
1 Parent(s): 80690cc

Update draft_pipeline.py

Browse files
Files changed (1) hide show
  1. draft_pipeline.py +43 -44
draft_pipeline.py CHANGED
@@ -1,44 +1,43 @@
1
- from doc_scraper import extract_first_table_as_dict
2
- from screenshoter import get_screenshot
3
- from parser_llm_agent import extract_name_and_price_from_images
4
- from doc_llm_agent import compile_images_and_text_to_doc
5
- from PIL import Image
6
-
7
def process_document_pipeline(doc_path):
    """Run the scrape -> screenshot -> extract -> compile pipeline for a document.

    Args:
        doc_path: Path of the document handed to ``extract_first_table_as_dict``.

    Returns:
        A 3-tuple ``(pil_images, final_data, compiled_doc_path)``:
        the screenshots taken, one merged ``{"Name", "Price", "Quantity"}``
        dict per screenshotted row, and the hard-coded output path string.
    """
    # Step 1: Scrape the first table from the document
    scraped_data = extract_first_table_as_dict(doc_path)

    # Step 2: Take screenshots of links present in the scraped dictionaries.
    # Only rows with a truthy "Link" are screenshotted, so keep exactly those
    # rows; later steps must pair against this list, not the full table.
    linked_rows = [row for row in scraped_data if row.get("Link")]
    pil_images = [get_screenshot(row.get("Link")) for row in linked_rows]

    # Step 3: Extract Name and Price from the images
    # NOTE(review): assumes one result per image, in image order - confirm.
    extracted_data = extract_name_and_price_from_images(pil_images)

    # Step 4: Mix each dictionary with extracted Name and Price.
    # Zipping against linked_rows (not scraped_data) keeps each Quantity with
    # the Name/Price read from that same row's screenshot, even when some
    # rows have no link.
    final_data = [
        {
            "Name": extracted_info.get("Name", "NONE"),
            "Price": extracted_info.get("Price", "NONE"),
            "Quantity": original_data.get("Quantity", "NONE"),
        }
        for original_data, extracted_info in zip(linked_rows, extracted_data)
    ]

    # Step 5: Compile the images and final data into a document
    # NOTE(review): the compile call's return value is ignored and this path
    # is presumed to be where it writes its output - confirm.
    compiled_doc_path = "compiled_output_with_images.docx"
    compile_images_and_text_to_doc(pil_images, final_data)

    return pil_images, final_data, compiled_doc_path
37
-
38
# Demo entry point: run the pipeline on a sample document and print each result.
if __name__ == "__main__":
    sample_path = "sample.docx"  # document fed to the pipeline
    shots, rows, output_path = process_document_pipeline(sample_path)
    for label, value in (
        ("Processed Images:", shots),
        ("Processed Data:", rows),
        ("Compiled Document Path:", output_path),
    ):
        print(label, value)
 
1
+ from doc_scraper import extract_first_table_as_dict
2
+ from screenshoter import get_screenshot
3
+ from parser_llm_agent import extract_name_and_price_from_images
4
+ from doc_llm_agent import compile_images_and_text_to_doc
5
+ from PIL import Image
6
+
7
def process_document_pipeline(doc_path):
    """Run the scrape -> screenshot -> extract -> compile pipeline for a document.

    Args:
        doc_path: Path of the document handed to ``extract_first_table_as_dict``.

    Returns:
        A 3-tuple ``(pil_images, final_data, compiled_doc)``:
        the screenshots taken, one merged ``{"Name", "Price", "Quantity"}``
        dict per screenshotted row, and whatever
        ``compile_images_and_text_to_doc`` returns.
    """
    # Step 1: Scrape the first table from the document
    scraped_data = extract_first_table_as_dict(doc_path)

    # Step 2: Take screenshots of links present in the scraped dictionaries.
    # Only rows with a truthy "Link" are screenshotted, so keep exactly those
    # rows; later steps must pair against this list, not the full table.
    linked_rows = [row for row in scraped_data if row.get("Link")]
    pil_images = [get_screenshot(row.get("Link")) for row in linked_rows]

    # Step 3: Extract Name and Price from the images
    # NOTE(review): assumes one result per image, in image order - confirm.
    extracted_data = extract_name_and_price_from_images(pil_images)

    # Step 4: Mix each dictionary with extracted Name and Price.
    # Zipping against linked_rows (not scraped_data) keeps each Quantity with
    # the Name/Price read from that same row's screenshot, even when some
    # rows have no link.
    final_data = [
        {
            "Name": extracted_info.get("Name", "NONE"),
            "Price": extracted_info.get("Price", "NONE"),
            "Quantity": original_data.get("Quantity", "NONE"),
        }
        for original_data, extracted_info in zip(linked_rows, extracted_data)
    ]

    # Step 5: Compile the images and final data into a document
    compiled_doc = compile_images_and_text_to_doc(pil_images, final_data)

    return pil_images, final_data, compiled_doc
36
+
37
# Demo entry point: run the pipeline on a sample document and print each result.
if __name__ == "__main__":
    sample_path = "sample.docx"  # document fed to the pipeline
    shots, rows, document = process_document_pipeline(sample_path)
    for label, value in (
        ("Processed Images:", shots),
        ("Processed Data:", rows),
        ("Compiled Document Object:", document),
    ):
        print(label, value)