Prasanthkumar committed on
Commit
b74602e
·
verified ·
1 Parent(s): 1736506

Delete document_parser.py

Browse files
Files changed (1) hide show
  1. document_parser.py +0 -160
document_parser.py DELETED
@@ -1,160 +0,0 @@
1
- import os
2
- import uuid
3
- import requests
4
- import tempfile
5
- from PIL import Image
6
- import pytesseract
7
- import pandas as pd
8
- from urllib.parse import urlparse
9
- from langchain_core.tools import tool
10
- from typing import Optional
11
- import logging
12
- import pandasql as psql
13
-
14
- # ------------------- 🔧 Logger Setup -------------------
15
def setup_logger(name: str = "FileToolLogger", level: int = logging.INFO) -> logging.Logger:
    """
    Return a logger that writes to a stream handler, configuring it on first use.

    Args:
        name (str): name of the logger to fetch. Defaults to "FileToolLogger".
        level (int): threshold set on the logger. Defaults to logging.INFO.

    Returns:
        logging.Logger: the logger registered under ``name``.
    """
    logger = logging.getLogger(name)
    logger.setLevel(level)
    # getLogger returns the same object for a given name, so a handler is only
    # attached when none is present; repeated calls would otherwise stack
    # handlers and emit every record several times.
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(handler)
    return logger


# Module-wide logger shared by the tools in this file.
logger = setup_logger()
26
-
27
- # ------------------- 📄 Save Content to File -------------------
28
@tool
def save_and_read_file(content: str, filename: Optional[str] = None) -> str:
    """
    Save content to a file in the system temporary directory and return the path.
    Args:
        content (str): the content to save to the file
        filename (str, optional): the name of the file. Only the final path component is used. If not provided, a random name file will be created.
    """
    temp_dir = tempfile.gettempdir()

    # Keep only the last path component so names such as "../x" or "/etc/x"
    # cannot place the file outside the temporary directory.
    safe_name = os.path.basename(filename) if filename else ""
    if safe_name in (".", ".."):
        safe_name = ""

    if safe_name:
        filepath = os.path.join(temp_dir, safe_name)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(content)
    else:
        # mkstemp returns an already-open descriptor; writing through fdopen
        # reuses and then closes it, so no handle is left dangling.
        fd, filepath = tempfile.mkstemp(dir=temp_dir)
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(content)

    return f"File saved to {filepath}. You can read this file to process its contents."
47
-
48
- # ------------------- 📥 Download File from URL -------------------
49
-
50
@tool
def download_file_from_url(url: str, filename: Optional[str] = None) -> str:
    """
    Download a file from a URL and save it to a temporary location.
    Args:
        url (str): the URL of the file to download.
        filename (str, optional): the name of the file. Only the final path component is used. If not provided, the last segment of the URL path is used, falling back to a random name.
    """
    try:
        # Reduce the caller's name (or, failing that, the URL path) to its last
        # component so the download cannot land outside the temp directory.
        candidate = filename or urlparse(url).path
        safe_name = os.path.basename(candidate)
        if safe_name in ("", ".", ".."):
            safe_name = f"downloaded_{uuid.uuid4().hex[:8]}"

        filepath = os.path.join(tempfile.gettempdir(), safe_name)

        # requests waits indefinitely unless a timeout is given. The context
        # manager releases the streamed connection even if writing fails.
        with requests.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()
            with open(filepath, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

        return f"File downloaded to {filepath}. You can read this file to process its contents."
    except Exception as e:
        # Tool boundary: the failure is reported as text rather than raised.
        return f"Error downloading file: {str(e)}"
82
-
83
@tool
def extract_text_from_image(image_path: str) -> str:
    """
    Extract text from an image using OCR library pytesseract (if available).
    Args:
        image_path (str): the path to the image file.
    """
    try:
        # The context manager closes the image's underlying file once OCR has
        # run, instead of leaving the handle open.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image)
        return f"Extracted text from image:\n\n{text}"
    except Exception as e:
        # Tool boundary: the failure is reported as text rather than raised.
        return f"Error extracting text from image: {str(e)}"
101
-
102
@tool
def analyze_csv_file(file_path: str, query: Optional[str] = None) -> str:
    """
    Analyze a CSV file using pandas and optionally run a SQL query against it.
    Args:
        file_path (str): the path to the CSV file.
        query (str, optional): SQL query executed with pandasql; the loaded data is available as the table `df`. If not provided, a summary of the data is returned.
    """
    # Compare the extension case-insensitively so "DATA.CSV" is accepted too.
    if not os.path.isfile(file_path) or not file_path.lower().endswith(".csv"):
        return "Invalid or missing csv file."

    try:
        df = pd.read_csv(file_path)
        # str.join rejects non-string items, so labels are converted first.
        column_names = ", ".join(map(str, df.columns))
        result = [f"CSV loaded with shape: {df.shape}", f" Columns: {column_names}"]

        if query:
            result.append(f"\n Query: {query}")
            result_df = psql.sqldf(query, {"df": df})
            result.append("Query Result:\n" + result_df.to_string(index=False))
        else:
            result.append("\nSummary:\n" + str(df.describe(include='all')))

        return "\n".join(result)
    except Exception as e:
        # Tool boundary: the failure is reported as text rather than raised.
        return f"Error analyzing CSV file: {str(e)}"
130
-
131
@tool
def analyze_excel_file(file_path: str, query: Optional[str] = None) -> str:
    """
    Analyze an Excel file using pandas and optionally run a SQL query against it.
    Args:
        file_path (str): the path to the xls or xlsx file.
        query (str, optional): SQL query executed with pandasql; the loaded data is available as the table `df`. If not provided, a summary of the data is returned.
    """
    # Compare the extension case-insensitively so "REPORT.XLSX" is accepted too.
    if not os.path.isfile(file_path) or not file_path.lower().endswith((".xls", ".xlsx")):
        return "Invalid or missing Excel file."

    try:
        df = pd.read_excel(file_path)
        # Spreadsheet headers may be numbers or dates; str.join rejects
        # non-string items, so labels are converted first.
        column_names = ", ".join(map(str, df.columns))
        # The status line previously said "CSV loaded", copied from the CSV tool.
        result = [f"Excel loaded with shape: {df.shape}", f" Columns: {column_names}"]

        if query:
            result.append(f"\n Query: {query}")
            result_df = psql.sqldf(query, {"df": df})
            result.append("Query Result:\n" + result_df.to_string(index=False))
        else:
            result.append("\nSummary:\n" + str(df.describe(include='all')))

        return "\n".join(result)
    except Exception as e:
        # Tool boundary: the failure is reported as text rather than raised.
        return f"Error analyzing Excel file: {str(e)}"
160
-