Prasanthkumar committed on
Commit
be87a5d
·
verified ·
1 Parent(s): 362630b

Update document_parser.py

Browse files
Files changed (1) hide show
  1. document_parser.py +160 -0
document_parser.py CHANGED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import uuid
3
+ import requests
4
+ import tempfile
5
+ from PIL import Image
6
+ import pytesseract
7
+ import pandas as pd
8
+ from urllib.parse import urlparse
9
+ from langchain_core.tools import tool
10
+ from typing import Optional
11
+ import logging
12
+ import pandasql as psql
13
+
14
+ # ------------------- 🔧 Logger Setup -------------------
15
def setup_logger(name: str = "FileToolLogger", level: int = logging.INFO) -> logging.Logger:
    """
    Return a named logger that writes timestamped records to a stream handler.

    Args:
        name (str): name of the logger to configure. Defaults to "FileToolLogger".
        level (int): logging level to set on the logger. Defaults to logging.INFO.

    Returns:
        logging.Logger: the configured logger. Calling this again with the same
        name returns the same logger without attaching a second handler.
    """
    configured = logging.getLogger(name)
    configured.setLevel(level)
    # Only attach a handler the first time; otherwise every call (or module
    # re-import) would duplicate each emitted log line.
    if not configured.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        configured.addHandler(handler)
    return configured


logger = setup_logger()
26
+
27
+ # ------------------- 📄 Save Content to File -------------------
28
+ @tool
29
def save_and_read_file(content: str, filename: Optional[str] = None) -> str:
    """
    Save content to a file in the system temporary directory and return the path.

    Args:
        content (str): the content to save to the file
        filename (str, optional): the name of the file. Only the final path
            component is used, so the file is always created inside the
            temporary directory. If not provided (or empty after stripping
            directories), a random name file will be created.

    Returns:
        str: a message containing the path of the saved file.
    """
    temp_dir = tempfile.gettempdir()

    # Drop any directory part so values such as "../x" or "/etc/x" cannot
    # escape the temporary directory.
    safe_name = os.path.basename(filename) if filename else ""

    if safe_name in ("", ".", ".."):
        # Write through the temporary-file handle itself so it is closed
        # deterministically instead of being left open and re-opened by path.
        with tempfile.NamedTemporaryFile(
            "w", delete=False, dir=temp_dir, encoding="utf-8"
        ) as handle:
            handle.write(content)
            filepath = handle.name
    else:
        filepath = os.path.join(temp_dir, safe_name)
        # Explicit UTF-8 so non-ASCII content does not depend on the platform
        # default encoding.
        with open(filepath, "w", encoding="utf-8") as handle:
            handle.write(content)

    return f"File saved to {filepath}. You can read this file to process its contents."
47
+
48
+ # ------------------- 🌐 Download File from URL -------------------
49
+
50
+ @tool
51
def download_file_from_url(url: str, filename: Optional[str] = None) -> str:
    """
    Download a file from a URL and save it to a temporary location.

    Args:
        url (str): the http or https URL of the file to download.
        filename (str, optional): the name of the file. Only the final path
            component is used. If not provided, the name is taken from the URL
            path; if that is empty too, a random name file will be created.

    Returns:
        str: a message containing the path of the downloaded file, or a
        message starting with "Error downloading file:" if anything fails.
    """
    try:
        parsed = urlparse(url)
        # Reject anything that is not a plain web URL before touching the
        # network or the filesystem.
        if parsed.scheme not in ("http", "https") or not parsed.netloc:
            return f"Error downloading file: unsupported or malformed URL: {url!r}"

        # Strip directory components so neither the caller-supplied name nor
        # the URL path can place the file outside the temporary directory.
        safe_name = os.path.basename(filename) if filename else os.path.basename(parsed.path)
        if safe_name in ("", ".", ".."):
            safe_name = f"downloaded_{uuid.uuid4().hex[:8]}"

        filepath = os.path.join(tempfile.gettempdir(), safe_name)

        # A timeout keeps a stalled server from blocking the call forever; the
        # context manager releases the streamed connection in every case.
        with requests.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()

            with open(filepath, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)

        return f"File downloaded to {filepath}. You can read this file to process its contents."
    except Exception as e:
        return f"Error downloading file: {str(e)}"
82
+
83
+ @tool
84
def extract_text_from_image(image_path: str) -> str:
    """
    Extract text from an image using OCR library pytesseract (if available).

    Args:
        image_path (str): the path to the image file.

    Returns:
        str: the extracted text prefixed with "Extracted text from image:", or
        a message starting with "Error extracting text from image:" on failure.
    """
    try:
        # Fail early with a clear message instead of relying on the imaging
        # library's error text for a missing path.
        if not os.path.isfile(image_path):
            return f"Error extracting text from image: file not found: {image_path}"

        # The context manager closes the underlying file handle once OCR is done.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image)

        return f"Extracted text from image:\n\n{text}"

    except Exception as e:
        return f"Error extracting text from image: {str(e)}"
101
+
102
+ @tool
103
def analyze_csv_file(file_path: str, query: Optional[str] = None) -> str:
    """
    Analyze a CSV file using pandas and optionally run a SQL query against it.

    Args:
        file_path (str): the path to the CSV file.
        query (str, optional): SQL query executed with pandasql; the loaded
            data is available as the table ``df``. If omitted, a statistical
            summary of the data is returned instead.

    Returns:
        str: the shape, column names and query result or summary, or an error
        message if the file is invalid or the analysis fails.
    """
    # Case-insensitive extension check so "DATA.CSV" is accepted too.
    if not os.path.isfile(file_path) or not file_path.lower().endswith(".csv"):
        return "Invalid or missing csv file."

    try:
        df = pd.read_csv(file_path)
        # str() each label: join() raises TypeError on non-string column labels.
        column_names = ", ".join(map(str, df.columns))
        result = [f"CSV loaded with shape: {df.shape}", f" Columns: {column_names}"]

        if query:
            result.append(f"\n Query: {query}")
            result_df = psql.sqldf(query, {"df": df})
            result.append("Query Result:\n" + result_df.to_string(index=False))
        else:
            result.append("\nSummary:\n" + str(df.describe(include='all')))

        return "\n".join(result)

    except Exception as e:
        return f"Error analyzing CSV file: {str(e)}"
130
+
131
+ @tool
132
def analyze_excel_file(file_path: str, query: Optional[str] = None) -> str:
    """
    Analyze an Excel file using pandas and optionally run a SQL query against it.

    Args:
        file_path (str): the path to the xls or xlsx file.
        query (str, optional): SQL query executed with pandasql; the loaded
            sheet is available as the table ``df``. If omitted, a statistical
            summary of the data is returned instead.

    Returns:
        str: the shape, column names and query result or summary, or an error
        message if the file is invalid or the analysis fails.
    """
    # Case-insensitive extension check so "REPORT.XLSX" is accepted too.
    if not os.path.isfile(file_path) or not file_path.lower().endswith((".xls", ".xlsx")):
        return "Invalid or missing Excel file."

    try:
        df = pd.read_excel(file_path)
        # Excel header cells are often numbers or dates; join() needs strings.
        column_names = ", ".join(map(str, df.columns))
        result = [f"Excel loaded with shape: {df.shape}", f" Columns: {column_names}"]

        if query:
            result.append(f"\n Query: {query}")
            result_df = psql.sqldf(query, {"df": df})
            result.append("Query Result:\n" + result_df.to_string(index=False))
        else:
            result.append("\nSummary:\n" + str(df.describe(include='all')))

        return "\n".join(result)

    except Exception as e:
        return f"Error analyzing Excel file: {str(e)}"
160
+