IotaCluster committed on
Commit
20f0475
·
verified ·
1 Parent(s): 660f406

Update parser_llm_agent.py

Browse files
Files changed (1) hide show
  1. parser_llm_agent.py +133 -133
parser_llm_agent.py CHANGED
@@ -1,133 +1,133 @@
1
- from gradio_client import Client, handle_file
2
- from langchain.tools import Tool
3
- from langchain.agents import initialize_agent
4
- from langchain_groq import ChatGroq
5
- import re
6
- import ast
7
- from dotenv import load_dotenv
8
- from PIL import Image
9
- import os
10
- import tempfile
11
-
12
- load_dotenv()
13
-
14
- def ocr_tool(image_path: str) -> str:
15
- try:
16
- client = Client("IotaCluster/OCR")
17
- # Accept both local file paths and URLs
18
- image_input = image_path
19
- if not (image_path.startswith('http://') or image_path.startswith('https://')):
20
- # Convert PIL image object to a temporary file for compatibility
21
- if isinstance(image_path, Image.Image):
22
- # Save PIL image to a temporary file
23
- temp_file = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
24
- image_path.save(temp_file.name, format="PNG")
25
- temp_file.close()
26
- image_input = temp_file.name
27
- print(f"🔍 Processing: {image_input}")
28
- else:
29
- image_input = handle_file(image_path)
30
- else:
31
- image_input = handle_file(image_path) # handle_file also works with URLs
32
- print("Image path or input for OCR:", image_input) # Debugging image input
33
- result = client.predict(
34
- image=image_input,
35
- language=["eng"],
36
- api_name="/predict"
37
- )
38
- return str(result)
39
- except Exception as e:
40
- return f"OCR failed: {str(e)}"
41
-
42
- def extract_name_and_price_from_image(image):
43
- # Process the image outside the tool, similar to doc_llm_agent
44
- temp_file = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
45
- image.save(temp_file.name, format="PNG")
46
- temp_file.close()
47
- image_input = temp_file.name
48
- print(f"🔍 Processing: {image_input}")
49
-
50
- ocr_text = agent.run(f"Use ImageOCR to extract text from image: {image_input}")
51
- print("OCR Text:", ocr_text) # Debugging OCR output
52
-
53
- prompt = (
54
- f"You are a helpful AI. Here is some OCR text extracted from an image:\n{ocr_text}\n\n"
55
- f"Your task is to extract the Name and Price from the text. Generalize the Name to make it more small if possible. If no Name or Price is found, return NONE for each."
56
- f" Return the result as a raw list of Python dictionaries with keys 'Name' and 'Price'."
57
- )
58
-
59
- llm_response = agent.run(prompt)
60
-
61
- # Extract the dictionary from the LLM output using regex
62
- dict_match = re.search(r'\{[^\{\}]*\}', llm_response, re.DOTALL)
63
- if dict_match:
64
- dict_str = dict_match.group(0)
65
- try:
66
- result_dict = ast.literal_eval(dict_str)
67
- except Exception:
68
- result_dict = {"Name": "NONE", "Price": "NONE"}
69
- else:
70
- result_dict = {"Name": "NONE", "Price": "NONE"}
71
-
72
- return result_dict
73
-
74
- def extract_name_and_price_from_images(image_list):
75
- results = []
76
-
77
- # Initialize the agent locally
78
- ocr_tool_instance = Tool(
79
- name="ImageOCR",
80
- func=ocr_tool,
81
- description="Extracts text content from an image using OCR via Gradio"
82
- )
83
-
84
- llm = ChatGroq(temperature=0, model_name="qwen/qwen3-32b")
85
- agent = initialize_agent(
86
- tools=[ocr_tool_instance],
87
- llm=llm,
88
- agent_type="zero-shot-react-description",
89
- verbose=True,
90
- handle_parsing_errors=True,
91
- max_iterations=5 # Reduce iterations to prevent infinite loops
92
- )
93
-
94
- for image in image_list:
95
- # Process each image
96
- temp_file = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
97
- image.save(temp_file.name, format="PNG")
98
- temp_file.close()
99
- image_input = temp_file.name
100
- print(f"🔍 Processing: {image_input}")
101
-
102
- ocr_text = agent.run(f"Use ocr tool to extract text from image: {image_input}")
103
- print("OCR Text:", ocr_text) # Debugging OCR output
104
-
105
- prompt = (
106
- f"You are a helpful AI. Here is some OCR text extracted from an image:\n{ocr_text}\n\n"
107
- f"Your task is to extract the Name and Price from the text. Generalize the Name to make it more small if possible. If no Name or Price is found, return NONE for each."
108
- f" Return the result as a raw list of Python dictionaries with keys 'Name' and 'Price'."
109
- )
110
-
111
- llm_response = agent.run(prompt)
112
-
113
- # Extract the dictionary from the LLM output using regex
114
- dict_match = re.search(r'\{[^\{\}]*\}', llm_response, re.DOTALL)
115
- if dict_match:
116
- dict_str = dict_match.group(0)
117
- try:
118
- result_dict = ast.literal_eval(dict_str)
119
- except Exception:
120
- result_dict = {"Name": "NONE", "Price": "NONE"}
121
- else:
122
- result_dict = {"Name": "NONE", "Price": "NONE"}
123
-
124
- results.append(result_dict)
125
-
126
- return results
127
-
128
- # Example usage
129
- if __name__ == "__main__":
130
- from PIL import Image
131
- image_list = [Image.open("4090.png"), Image.open("eg_img.png"), Image.open("5080.png")]
132
- result = extract_name_and_price_from_images(image_list)
133
- print(result)
 
1
+ from gradio_client import Client, handle_file
2
+ from langchain.tools import Tool
3
+ from langchain.agents import initialize_agent
4
+ from langchain_groq import ChatGroq
5
+ import re
6
+ import ast
7
+ from dotenv import load_dotenv
8
+ from PIL import Image
9
+ import os
10
+ import tempfile
11
+
12
+ load_dotenv()
13
+
14
+ def ocr_tool(image_path: str) -> str:
15
+ try:
16
+ client = Client("IotaCluster/OCR")
17
+ # Accept both local file paths and URLs
18
+ image_input = image_path
19
+ if not (image_path.startswith('http://') or image_path.startswith('https://')):
20
+ # Convert PIL image object to a temporary file for compatibility
21
+ if isinstance(image_path, Image.Image):
22
+ # Save PIL image to a temporary file
23
+ temp_file = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
24
+ image_path.save(temp_file.name, format="PNG")
25
+ temp_file.close()
26
+ image_input = temp_file.name
27
+ print(f"🔍 Processing: {image_input}")
28
+ else:
29
+ image_input = handle_file(image_path)
30
+ else:
31
+ image_input = handle_file(image_path) # handle_file also works with URLs
32
+ print("Image path or input for OCR:", image_input) # Debugging image input
33
+ result = client.predict(
34
+ image=image_input,
35
+ language=["eng"],
36
+ api_name="/predict"
37
+ )
38
+ return str(result)
39
+ except Exception as e:
40
+ return f"OCR failed: {str(e)}"
41
+
42
+ def extract_name_and_price_from_image(image):
43
+ # Process the image outside the tool, similar to doc_llm_agent
44
+ temp_file = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
45
+ image.save(temp_file.name, format="PNG")
46
+ temp_file.close()
47
+ image_input = temp_file.name
48
+ print(f"🔍 Processing: {image_input}")
49
+
50
+ ocr_text = agent.run(f"Use ImageOCR to extract text from image: {image_input}")
51
+ print("OCR Text:", ocr_text) # Debugging OCR output
52
+
53
+ prompt = (
54
+ f"You are a helpful AI. Here is some OCR text extracted from an image:\n{ocr_text}\n\n"
55
+ f"Your task is to extract the Name and Price from the text. Generalize the Name to make it more small if possible. If no Name or Price is found, return NONE for each."
56
+ f" Return the result as a raw list of Python dictionaries with keys 'Name' and 'Price'."
57
+ )
58
+
59
+ llm_response = agent.run(prompt)
60
+
61
+ # Extract the dictionary from the LLM output using regex
62
+ dict_match = re.search(r'\{[^\{\}]*\}', llm_response, re.DOTALL)
63
+ if dict_match:
64
+ dict_str = dict_match.group(0)
65
+ try:
66
+ result_dict = ast.literal_eval(dict_str)
67
+ except Exception:
68
+ result_dict = {"Name": "NONE", "Price": "NONE"}
69
+ else:
70
+ result_dict = {"Name": "NONE", "Price": "NONE"}
71
+
72
+ return result_dict
73
+
74
+ def extract_name_and_price_from_images(image_list):
75
+ results = []
76
+
77
+ # Initialize the agent locally
78
+ ocr_tool_instance = Tool(
79
+ name="ImageOCR",
80
+ func=ocr_tool,
81
+ description="Extracts text content from an image using OCR via Gradio"
82
+ )
83
+
84
+ llm = ChatGroq(temperature=0, model_name="qwen/qwen3-32b", api_key=os.getenv("GROQ_API_KEY1"))
85
+ agent = initialize_agent(
86
+ tools=[ocr_tool_instance],
87
+ llm=llm,
88
+ agent_type="zero-shot-react-description",
89
+ verbose=True,
90
+ handle_parsing_errors=True,
91
+ max_iterations=5 # Reduce iterations to prevent infinite loops
92
+ )
93
+
94
+ for image in image_list:
95
+ # Process each image
96
+ temp_file = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
97
+ image.save(temp_file.name, format="PNG")
98
+ temp_file.close()
99
+ image_input = temp_file.name
100
+ print(f"🔍 Processing: {image_input}")
101
+
102
+ ocr_text = agent.run(f"Use ocr tool to extract text from image: {image_input}")
103
+ print("OCR Text:", ocr_text) # Debugging OCR output
104
+
105
+ prompt = (
106
+ f"You are a helpful AI. Here is some OCR text extracted from an image:\n{ocr_text}\n\n"
107
+ f"Your task is to extract the Name and Price from the text. Generalize the Name to make it more small if possible. If no Name or Price is found, return NONE for each."
108
+ f" Return the result as a raw list of Python dictionaries with keys 'Name' and 'Price'."
109
+ )
110
+
111
+ llm_response = agent.run(prompt)
112
+
113
+ # Extract the dictionary from the LLM output using regex
114
+ dict_match = re.search(r'\{[^\{\}]*\}', llm_response, re.DOTALL)
115
+ if dict_match:
116
+ dict_str = dict_match.group(0)
117
+ try:
118
+ result_dict = ast.literal_eval(dict_str)
119
+ except Exception:
120
+ result_dict = {"Name": "NONE", "Price": "NONE"}
121
+ else:
122
+ result_dict = {"Name": "NONE", "Price": "NONE"}
123
+
124
+ results.append(result_dict)
125
+
126
+ return results
127
+
128
+ # Example usage
129
+ if __name__ == "__main__":
130
+ from PIL import Image
131
+ image_list = [Image.open("4090.png"), Image.open("eg_img.png"), Image.open("5080.png")]
132
+ result = extract_name_and_price_from_images(image_list)
133
+ print(result)