srivatsavdamaraju committed on
Commit
3aa881c
·
verified ·
1 Parent(s): 32cb86b

Update s3/meta_data_creation_from_s3.py

Browse files
Files changed (1) hide show
  1. s3/meta_data_creation_from_s3.py +168 -167
s3/meta_data_creation_from_s3.py CHANGED
@@ -1,167 +1,168 @@
1
- import os
2
- import io
3
- import json
4
- import datetime
5
- import pandas as pd
6
- from langchain.chat_models import ChatOpenAI
7
- from langchain.prompts import PromptTemplate
8
- import boto3
9
- from io import StringIO
10
- from botocore.exceptions import ClientError
11
- import dotenv
12
- import os
13
- import sys
14
-
15
- from pathlib import Path
16
- from typing import List
17
- PROJECT_ROOT = Path(__file__).resolve().parents[1]
18
- if str(PROJECT_ROOT) not in sys.path:
19
- sys.path.insert(0, str(PROJECT_ROOT))
20
- from retrieve_secret import *
21
-
22
- # Load environment variables from .env file if present
23
- dotenv.load_dotenv()
24
-
25
- # Get the OPENAI_API_KEY from the environment variable
26
- # print(f"API_KEY: {OPENAI_API_KEY}")
27
-
28
- # Set your OpenAI API Key (better: use Colab "Secrets" to avoid hardcoding)
29
-
30
-
31
-
32
- # === CONFIG ===
33
- ENDPOINT_URL = "https://s3.us-west-1.idrivee2.com"
34
- ACCESS_KEY = "rNuPBAQetemqpEeBospZ"
35
- SECRET_KEY = "BU4FccUYxzXVqiWjPSJM1CWEX1cNhBqbU9NeGidE"
36
- BUCKET_NAME = "accusagas3" # replace with your actual bucket
37
-
38
- # === INITIALIZE CLIENT ===
39
- s3 = boto3.client(
40
- "s3",
41
- endpoint_url=ENDPOINT_URL,
42
- aws_access_key_id=ACCESS_KEY,
43
- aws_secret_access_key=SECRET_KEY,
44
- )
45
-
46
-
47
- def read_csv_from_s3(path: str) -> pd.DataFrame:
48
- """
49
- Reads a CSV file from iDrive e2 (S3) directly into a Pandas DataFrame.
50
-
51
- Args:
52
- path (str): The S3 key/path, e.g. "vatsav_123/reports/Gold Futures Historical Data.csv"
53
-
54
- Returns:
55
- pd.DataFrame: Loaded DataFrame from the CSV
56
- """
57
- try:
58
- # Fetch object from bucket
59
- response = s3.get_object(Bucket=BUCKET_NAME, Key=path)
60
-
61
- # Read the content
62
- csv_data = response["Body"].read().decode("utf-8")
63
-
64
- # Convert to DataFrame
65
- df = pd.read_csv(StringIO(csv_data))
66
-
67
- print(f"✅ Successfully loaded {path} into DataFrame")
68
- return df
69
-
70
- except ClientError as e:
71
- if e.response["Error"]["Code"] == "NoSuchKey":
72
- print(f" File not found in S3: {path}")
73
- else:
74
- print(f"❌ S3 error: {e}")
75
- return pd.DataFrame() # return empty dataframe on failure
76
- except Exception as e:
77
- print(f"❌ Unexpected error: {e}")
78
- return pd.DataFrame()
79
-
80
-
81
- def create_file_metadata_from_df(df: pd.DataFrame, file_name: str, file_path: str) -> dict:
82
- """
83
- Generate metadata from a DataFrame (directly from S3).
84
- """
85
- # --- Basic File Info ---
86
- file_type = "csv"
87
- file_size = len(df)
88
- created_date = datetime.datetime.now().strftime("%Y-%m-%d")
89
- modified_date = created_date # For this example, we use current date as a placeholder
90
-
91
- # --- Extract Structured Info from DataFrame ---
92
- all_columns_list = df.columns.tolist()
93
- data_types = {col: str(df[col].dtype) for col in df.columns}
94
-
95
- sheet_info = [{
96
- "sheet_name": "CSV",
97
- "num_rows": len(df),
98
- "num_columns": len(df.columns),
99
- "columns": df.columns.tolist(),
100
- "sample_data": df.head(3).to_dict(orient="records")
101
- }]
102
-
103
- # --- Text preview for LLM ---
104
- file_preview = df.head(5).to_string() # Preview the first 5 rows as a text snippet
105
-
106
- # --- OpenAI LLM for summary + tags ---
107
- llm = ChatOpenAI(model="gpt-4o-mini", openai_api_key=os.environ["OPENAI_API_KEY"])
108
-
109
- prompt = PromptTemplate(
110
- input_variables=["file_name", "file_type", "columns", "sheets", "preview"],
111
- template="""
112
- You are a metadata extractor for data files.
113
- Given this file:
114
- - File Name: {file_name}
115
- - File Type: {file_type}
116
- - Sheets: {sheets}
117
- - Columns: {columns}
118
- - Preview Content: {preview}
119
-
120
- Generate strictly valid JSON with keys:
121
- - summary_text: A concise 2-3 sentence description of the file.
122
- - tags: 8-12 recommended metadata search keywords. These tags should be:
123
- * Derived from the file name, sheet names, column names, and content
124
- * Consider the inferred data domain (healthcare, finance, etc.)
125
- * Focus on analysis use cases (e.g., "trends", "forecasting", "customer behavior")
126
- """
127
- )
128
-
129
- ai_response = llm.predict(prompt.format(file_name=file_name, file_type=file_type, columns=all_columns_list, sheets=sheet_info, preview=file_preview))
130
-
131
- # Try parsing the AI response to valid JSON
132
- try:
133
- ai_json = json.loads(ai_response)
134
- except:
135
- # In case the AI response is not valid JSON, fallback to simple text
136
- ai_json = {"summary_text": ai_response.strip(), "tags": ["data", "file", "metadata"]}
137
-
138
- # --- Build Final Metadata JSON ---
139
- metadata = {
140
- "file_name": file_name,
141
- "file_path": file_path,
142
- "file_type": file_type,
143
- "file_size_bytes": file_size,
144
- "num_sheets": len(sheet_info),
145
- "sheets": sheet_info,
146
- "all_columns_list": list(set(all_columns_list)),
147
- "data_types": data_types,
148
- "tags": ai_json.get("tags", []),
149
- "summary_text": ai_json.get("summary_text", ""),
150
- "created_date": created_date,
151
- "modified_date": modified_date,
152
- "creator": "system_user"
153
- }
154
-
155
- return metadata
156
-
157
-
158
- # Example usage:
159
- # path = "vatsav/csv/Gold Futures Historical Data.csv"
160
- # df = read_csv_from_s3(path)
161
- # file_name = "Gold Futures Historical Data.csv"
162
- # file_path = path # In case you want to keep the S3 path
163
-
164
- # # Now we use the new function to generate metadata
165
- # metadata = create_file_metadata_from_df(df, file_name, file_path)
166
- # # print("+"*60)
167
- # print(json.dumps(metadata, indent=2))
 
 
1
import os
import io
import json
import datetime
import sys
from io import StringIO
from pathlib import Path
from typing import List

import pandas as pd
import boto3
import dotenv
from botocore.exceptions import ClientError
# This commit switched the module to the langchain_openai package but dropped
# ChatOpenAI, which create_file_metadata_from_df still calls — import both so
# the function does not raise NameError at runtime.
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import PromptTemplate

# Make the project root importable so retrieve_secret resolves regardless of
# the current working directory this script is launched from.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
from retrieve_secret import *  # noqa: E402,F401,F403 — project secret loading

# Load environment variables from .env file if present
dotenv.load_dotenv()
25
+
26
+ # Get the OPENAI_API_KEY from the environment variable
27
+ # print(f"API_KEY: {OPENAI_API_KEY}")
28
+
29
+ # Set your OpenAI API Key (better: use Colab "Secrets" to avoid hardcoding)
30
+
31
+
32
+
33
# === CONFIG ===
# SECURITY NOTE(review): these S3 credentials were previously hard-coded and
# are committed to version control — they should be rotated. They are now read
# from the environment first, with the old literals kept only as fallbacks so
# existing deployments continue to work. FIXME: delete the fallbacks once the
# environment variables are provisioned everywhere.
ENDPOINT_URL = os.getenv("IDRIVE_E2_ENDPOINT_URL", "https://s3.us-west-1.idrivee2.com")
ACCESS_KEY = os.getenv("IDRIVE_E2_ACCESS_KEY", "rNuPBAQetemqpEeBospZ")
SECRET_KEY = os.getenv("IDRIVE_E2_SECRET_KEY", "BU4FccUYxzXVqiWjPSJM1CWEX1cNhBqbU9NeGidE")
BUCKET_NAME = os.getenv("IDRIVE_E2_BUCKET", "accusagas3")  # replace with your actual bucket

# === INITIALIZE CLIENT ===
# Module-level client, shared by every call to read_csv_from_s3.
s3 = boto3.client(
    "s3",
    endpoint_url=ENDPOINT_URL,
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
)
46
+
47
+
48
def read_csv_from_s3(path: str) -> pd.DataFrame:
    """
    Load a CSV object from iDrive e2 (S3-compatible storage) into a DataFrame.

    Args:
        path (str): The S3 key/path, e.g. "vatsav_123/reports/Gold Futures Historical Data.csv"

    Returns:
        pd.DataFrame: Parsed DataFrame, or an empty DataFrame on any failure
        (best-effort: this function never raises).
    """
    try:
        # Pull the object out of the configured bucket and decode its bytes.
        obj = s3.get_object(Bucket=BUCKET_NAME, Key=path)
        raw_text = obj["Body"].read().decode("utf-8")
        frame = pd.read_csv(StringIO(raw_text))
    except ClientError as err:
        # Distinguish a missing object from every other S3-side failure.
        if err.response["Error"]["Code"] == "NoSuchKey":
            print(f"❌ File not found in S3: {path}")
        else:
            print(f"❌ S3 error: {err}")
        return pd.DataFrame()  # return empty dataframe on failure
    except Exception as err:
        print(f"❌ Unexpected error: {err}")
        return pd.DataFrame()
    else:
        print(f"✅ Successfully loaded {path} into DataFrame")
        return frame
80
+
81
+
82
def create_file_metadata_from_df(df: pd.DataFrame, file_name: str, file_path: str) -> dict:
    """
    Generate a metadata dictionary for a CSV DataFrame loaded from S3.

    Combines structural facts taken directly from ``df`` (columns, dtypes,
    sample rows) with an LLM-generated summary and tag list.

    Args:
        df: DataFrame whose metadata is being described.
        file_name: Display name of the source file.
        file_path: S3 key (or path) of the source file.

    Returns:
        dict: file info, per-sheet structure, dtypes, LLM tags and summary.

    Raises:
        KeyError: if OPENAI_API_KEY is not set in the environment.
    """
    # Local import: the module-level imports in this commit only bring in
    # OpenAIEmbeddings, so import ChatOpenAI here to keep this function working.
    from langchain_openai import ChatOpenAI

    # --- Basic File Info ---
    file_type = "csv"
    # NOTE(review): this is the ROW COUNT, not a byte size, although it is
    # stored under "file_size_bytes" below — kept as-is for backward
    # compatibility with existing consumers of this metadata.
    file_size = len(df)
    created_date = datetime.datetime.now().strftime("%Y-%m-%d")
    modified_date = created_date  # placeholder: the S3 mtime is not fetched here

    # --- Extract Structured Info from DataFrame ---
    all_columns_list = df.columns.tolist()
    data_types = {col: str(df[col].dtype) for col in df.columns}

    # CSVs are single-table; model them as one pseudo-sheet for a uniform shape.
    sheet_info = [{
        "sheet_name": "CSV",
        "num_rows": len(df),
        "num_columns": len(df.columns),
        "columns": df.columns.tolist(),
        "sample_data": df.head(3).to_dict(orient="records")
    }]

    # --- Text preview for LLM ---
    file_preview = df.head(5).to_string()  # Preview the first 5 rows as a text snippet

    # --- OpenAI LLM for summary + tags ---
    llm = ChatOpenAI(model="gpt-4o-mini", openai_api_key=os.environ["OPENAI_API_KEY"])

    prompt = PromptTemplate(
        input_variables=["file_name", "file_type", "columns", "sheets", "preview"],
        template="""
You are a metadata extractor for data files.
Given this file:
- File Name: {file_name}
- File Type: {file_type}
- Sheets: {sheets}
- Columns: {columns}
- Preview Content: {preview}

Generate strictly valid JSON with keys:
- summary_text: A concise 2-3 sentence description of the file.
- tags: 8-12 recommended metadata search keywords. These tags should be:
  * Derived from the file name, sheet names, column names, and content
  * Consider the inferred data domain (healthcare, finance, etc.)
  * Focus on analysis use cases (e.g., "trends", "forecasting", "customer behavior")
"""
    )

    # invoke() replaces the deprecated predict(); the chat model returns a
    # message object whose .content is the text.
    ai_response = llm.invoke(
        prompt.format(
            file_name=file_name,
            file_type=file_type,
            columns=all_columns_list,
            sheets=sheet_info,
            preview=file_preview,
        )
    ).content

    # Models frequently wrap JSON in ``` fences; strip them before parsing.
    cleaned = ai_response.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned.strip("`").strip()
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:].strip()

    # Try parsing the AI response to valid JSON; fall back to raw text.
    try:
        ai_json = json.loads(cleaned)
    except (json.JSONDecodeError, ValueError):
        ai_json = {"summary_text": ai_response.strip(), "tags": ["data", "file", "metadata"]}

    # --- Build Final Metadata JSON ---
    metadata = {
        "file_name": file_name,
        "file_path": file_path,
        "file_type": file_type,
        "file_size_bytes": file_size,
        "num_sheets": len(sheet_info),
        "sheets": sheet_info,
        "all_columns_list": list(set(all_columns_list)),
        "data_types": data_types,
        "tags": ai_json.get("tags", []),
        "summary_text": ai_json.get("summary_text", ""),
        "created_date": created_date,
        "modified_date": modified_date,
        "creator": "system_user"
    }

    return metadata
157
+
158
+
159
+ # Example usage:
160
+ # path = "vatsav/csv/Gold Futures Historical Data.csv"
161
+ # df = read_csv_from_s3(path)
162
+ # file_name = "Gold Futures Historical Data.csv"
163
+ # file_path = path # In case you want to keep the S3 path
164
+
165
+ # # Now we use the new function to generate metadata
166
+ # metadata = create_file_metadata_from_df(df, file_name, file_path)
167
+ # # print("+"*60)
168
+ # print(json.dumps(metadata, indent=2))