triflix committed on
Commit
81f703a
·
verified ·
1 Parent(s): e8c15b1

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +20 -0
  2. app.py +252 -0
  3. requirements.txt +7 -0
Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use an official Python runtime as a parent image.
# Debian "buster" is end-of-life and no longer receives security updates, so
# build on the supported "bookworm" slim variant of the same Python version.
FROM python:3.9-slim-bookworm

# Set the working directory in the container
WORKDIR /app

# Install any needed packages specified in requirements.txt
# First, copy just the requirements.txt to leverage Docker cache
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application code
COPY . .

# Expose port 7860 as requested by the user for Hugging Face Spaces
EXPOSE 7860

# Command to run the application
# Use Uvicorn to run FastAPI, binding to 0.0.0.0 and the exposed port
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import json
4
+ import pandas as pd
5
+ import base64
6
+ from google import genai
7
+ from google.genai import types
8
+ from fastapi import FastAPI, UploadFile, File, HTTPException
9
+ from typing import List
10
+
11
+ app = FastAPI()
12
+
13
+ # Define a temporary directory for file storage
14
+ TMP_DIR = "/tmp/fastapi_files"
15
+
16
@app.on_event("startup")
async def startup_event():
    """Make sure the upload scratch directory is present when the app starts."""
    # exist_ok=True turns this into a no-op when the directory already exists.
    os.makedirs(name=TMP_DIR, exist_ok=True)
20
+
21
@app.on_event("shutdown")
async def shutdown_event():
    """Remove the upload scratch directory, and everything in it, on shutdown."""
    if not os.path.exists(TMP_DIR):
        # Nothing was ever created (or it is already gone); nothing to clean.
        return
    shutil.rmtree(TMP_DIR)
26
+
27
def load_file(path: str):
    """Read a tabular file from *path* into a pandas DataFrame.

    The format is chosen from the (case-insensitive) file extension:
    ``.csv`` is read with ``pd.read_csv``; ``.xls`` / ``.xlsx`` are read with
    ``pd.read_excel``, always taking the first sheet because an API call
    cannot interactively ask which sheet to use.

    Raises:
        ValueError: if the extension is none of the supported ones.
    """
    suffix = os.path.splitext(path)[-1].lower()
    if suffix not in (".csv", ".xls", ".xlsx"):
        raise ValueError("Unsupported file type")

    if suffix == ".csv":
        frame = pd.read_csv(path)
    else:
        frame = pd.read_excel(path, sheet_name=0)
    return frame.copy()
39
+
40
def preprocess(df, drop_thresh=0.5):
    """Return a cleaned copy of *df*; the input frame is not modified.

    Steps, in order:
      1. Normalise column names: stringify, strip, lower-case, and replace
         spaces with underscores.
      2. Drop columns whose fraction of missing values is >= *drop_thresh*.
      3. Fill remaining gaps: numeric columns with the column median,
         datetime columns with 1970-01-01, everything else with "Unknown".
      4. Convert text columns to a numeric dtype when every value parses.
      5. Drop duplicate rows.

    Args:
        df: the raw DataFrame.
        drop_thresh: missing-value fraction at or above which a column is
            dropped.

    Returns:
        The cleaned DataFrame.
    """
    df = df.copy()
    df.columns = [str(c).strip().lower().replace(" ", "_") for c in df.columns]

    # .copy() gives an independent frame so the column assignments below never
    # write through to (or warn about) a slice of the original.
    df = df.loc[:, df.isnull().mean() < drop_thresh].copy()

    # Assign with df[col] = ... rather than df.loc[:, col] = ...: on
    # pandas >= 2.0 the .loc form writes into the existing column in place and
    # keeps its old dtype, which silently defeats the numeric coercion below.
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].fillna(df[col].median())
        elif pd.api.types.is_datetime64_any_dtype(df[col]):
            df[col] = df[col].fillna(pd.Timestamp('1970-01-01'))
        else:
            df[col] = df[col].fillna("Unknown")

    for col in df.columns:
        is_textual = (
            pd.api.types.is_object_dtype(df[col])
            or pd.api.types.is_string_dtype(df[col])
        )
        if not is_textual:
            continue
        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            # Not every value is numeric: deliberately keep the column as text.
            pass

    df = df.drop_duplicates()
    return df
59
+
60
def metadata(df):
    """Summarise the structure of *df* as a plain dictionary.

    Returns a dict with the row count, the column count, the list of column
    names, a mapping of column name to dtype string, and a mapping of column
    name to its number of distinct values.
    """
    row_count, column_count = df.shape
    names = list(df.columns)
    dtype_names = {name: str(dtype) for name, dtype in df.dtypes.items()}
    distinct_counts = {name: df[name].nunique() for name in names}
    return {
        "rows": row_count,
        "columns": column_count,
        "column_names": names,
        "column_types": dtype_names,
        "unique_values": distinct_counts,
    }
68
+
69
# System instruction sent to the model: fixes the JSON output contract and the
# rules the generated chart code has to follow.
_SUMMARY_SYSTEM_PROMPT = """
You are a strict JSON generator.
Input contains:
- meta: dataframe metadata
- fiverow: first 5 records of dataframe

You must output JSON with the following structure:
{
  "summary": "<short natural language overview of dataset>",
  "recommended_charts": [
    {
      "type": "<one of: bar, pie, timeseries, histogram, scatter, multiple_columns, stacked_bar, heatmap>",
      "title": "<short title for chart>",
      "columns": ["<col1>", "<col2>", "..."],
      "python_code": "<full runnable Python code using seaborn/matplotlib that produces the chart>"
    },
    ...
  ]
}

Mandatory rules:
- Always produce syntactically valid JSON ONLY. No text outside the JSON object.
- Provide at least these chart types somewhere in recommended_charts: bar, pie, timeseries, histogram, scatter, multiple_columns, stacked_bar, heatmap.
- Use only column names that appear in meta['column_names'].
- The python_code string must be self-contained and runnable assuming a variable `df` exists containing the full cleaned DataFrame. Start the code with imports:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
and include any necessary preprocessing steps (e.g., parsing dates).
- For timeseries charts ensure the datetime column is parsed (`pd.to_datetime`) before plotting.
- For multiple_columns provide a pairplot or facetgrid example that uses up to 4 numeric columns or sensible categorical splits.
- For stacked_bar, show aggregation code (groupby + unstack) and plotting with df.plot(kind='bar', stacked=True).
- For heatmap, compute correlation matrix and plot sns.heatmap with annotations.
- For pie charts, ensure grouping/aggregation when there are >20 unique categories (group small categories into 'Other').
- For histogram and scatter include axis labels and tight_layout; include plt.show() at the end.
- Keep code minimal but complete so a user can copy-paste and run (assume seaborn, matplotlib, pandas installed).
- For each chart add a sensible "columns" list showing which columns the code uses.
- Do not include examples using columns not present in meta.
- Do not include more than 10 recommended_charts.
- Ensure strings inside the JSON are escaped properly so the JSON parses.

Produce concise natural-language one-line summary in "summary". Ensure JSON is parseable by json.loads in Python.
"""


def generate_summary(meta, fiverow):
    """Ask the Gemini model for a dataset summary and chart recommendations.

    Args:
        meta: dataframe metadata dictionary.
        fiverow: the first records of the dataframe as a list of dicts.

    Returns:
        The raw text streamed back by the model. The request asks for an
        ``application/json`` response, but the text is returned unparsed.

    Raises:
        RuntimeError: if neither GEMINI_API_KEY nor GOOGLE_API_KEY is set.
    """
    # The key must come from the environment (e.g. a Space secret). A key
    # committed to the repository is public and has to be revoked and rotated.
    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Gemini API key is not configured; set the GEMINI_API_KEY "
            "(or GOOGLE_API_KEY) environment variable."
        )

    client = genai.Client(api_key=api_key)
    model = "gemini-2.5-flash-lite"

    user_prompt = {
        "meta": meta,
        "fiverow": fiverow
    }

    contents = [
        types.Content(
            role="user",
            parts=[types.Part.from_text(text=str(user_prompt))],
        ),
    ]

    generate_content_config = types.GenerateContentConfig(
        thinking_config=types.ThinkingConfig(thinking_budget=0),
        response_mime_type="application/json",
        system_instruction=[types.Part.from_text(text=_SUMMARY_SYSTEM_PROMPT)],
    )

    # Collect the streamed pieces and join once instead of repeated `+=`.
    pieces = []
    for chunk in client.models.generate_content_stream(
        model=model,
        contents=contents,
        config=generate_content_config,
    ):
        if chunk.text:
            pieces.append(chunk.text)
    return "".join(pieces)
145
+
146
+
147
@app.get("/")
async def read_root():
    """Return the static welcome payload served at the API root."""
    greeting = "Welcome to the FastAPI Hugging Face Space API with Data Analysis!"
    return {"message": greeting}
150
+
151
@app.post("/analyze_data/")
async def analyze_data(file: UploadFile = File(...)):
    """
    Uploads a file, preprocesses it, and generates a summary and recommended charts.

    Returns the model's response parsed as JSON.

    Raises:
        HTTPException 400: the file could not be loaded or cleaned (for
            example an unsupported file type).
        HTTPException 502: the model's reply was not valid JSON.
        HTTPException 500: any other failure.
    """
    import tempfile

    # The client-supplied filename is untrusted. Only its extension is used;
    # the data goes to a uniquely named temp file, so a crafted name cannot
    # escape TMP_DIR and concurrent uploads with the same name cannot collide.
    client_name = os.path.basename(file.filename or "")
    suffix = os.path.splitext(client_name)[-1].lower()
    file_path = None

    # NOTE: the pandas work and the model call below are synchronous and block
    # the event loop for the duration of the request.
    try:
        os.makedirs(TMP_DIR, exist_ok=True)

        # Save the uploaded file to the temporary directory
        with tempfile.NamedTemporaryFile(
            dir=TMP_DIR, suffix=suffix, delete=False
        ) as buffer:
            file_path = buffer.name
            shutil.copyfileobj(file.file, buffer)

        # Load and preprocess the file
        df = load_file(file_path)
        df_clean = preprocess(df)

        # Generate metadata and first 5 rows
        meta = metadata(df_clean)
        fiverow = df_clean.head(5).to_dict(orient="records")

        # Generate summary and charts using the AI model
        summary_json = generate_summary(meta, fiverow)

        try:
            return json.loads(summary_json)  # Return the parsed JSON response
        except json.JSONDecodeError as exc:
            # JSONDecodeError subclasses ValueError; without this branch a bad
            # model reply would be reported as a 400 client error.
            raise HTTPException(
                status_code=502,
                detail=f"The AI model returned invalid JSON: {exc}",
            ) from exc
    except HTTPException:
        # Already carries the intended status; do not rewrap it as a 500.
        raise
    except ValueError as ve:
        raise HTTPException(status_code=400, detail=str(ve))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"An error occurred during data analysis: {e}")
    finally:
        # Clean up the uploaded file whether processing succeeded or failed.
        if file_path is not None and os.path.exists(file_path):
            os.remove(file_path)
181
+
182
+ # The following endpoints are kept for general file management but are not directly used by the new /analyze_data endpoint.
183
+ # They can be removed if not needed, or modified to work with the /tmp directory.
184
@app.post("/uploadfile/")
async def create_upload_file(file: UploadFile = File(...)):
    """
    Uploads a single file to the temporary directory.

    Only the final path component of the client-supplied filename is used, so
    the file is always written directly inside TMP_DIR.

    Raises:
        HTTPException 400: the filename is missing or not a usable name.
        HTTPException 500: the file could not be written.
    """
    # The filename is untrusted input: strip directory parts and reject names
    # that would resolve to TMP_DIR itself or to its parent.
    safe_name = os.path.basename(file.filename or "")
    if safe_name in ("", ".", ".."):
        raise HTTPException(status_code=400, detail="Invalid filename.")

    file_path = os.path.join(TMP_DIR, safe_name)
    try:
        with open(file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
        return {"filename": safe_name, "message": f"File '{safe_name}' uploaded successfully to {TMP_DIR}"}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Could not upload file: {e}")
196
+
197
@app.post("/uploadfiles/")
async def create_upload_files(files: List[UploadFile] = File(...)):
    """
    Uploads multiple files to the temporary directory.

    Only the final path component of each client-supplied filename is used, so
    every file is written directly inside TMP_DIR. Processing stops at the
    first failure; files written before it remain on disk.

    Raises:
        HTTPException 400: a filename is missing or not a usable name.
        HTTPException 500: a file could not be written.
    """
    uploaded_files = []
    for file in files:
        # The filename is untrusted input: strip directory parts and reject
        # names that would resolve to TMP_DIR itself or to its parent.
        safe_name = os.path.basename(file.filename or "")
        if safe_name in ("", ".", ".."):
            raise HTTPException(status_code=400, detail="Invalid filename.")

        file_path = os.path.join(TMP_DIR, safe_name)
        try:
            with open(file_path, "wb") as buffer:
                shutil.copyfileobj(file.file, buffer)
            uploaded_files.append({"filename": safe_name, "path": file_path})
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Could not upload file '{safe_name}': {e}")
    return {"message": f"Successfully uploaded {len(uploaded_files)} files to {TMP_DIR}", "files": uploaded_files}
212
+
213
@app.get("/list_files/")
async def list_uploaded_files():
    """
    Lists all files currently in the temporary directory.

    Returns the directory entries together with the directory path, or an
    explanatory message when the directory is absent.
    """
    if os.path.exists(TMP_DIR):
        entries = os.listdir(TMP_DIR)
        return {"files": entries, "path": TMP_DIR}

    return {"message": "Temporary directory does not exist or is empty."}
223
+
224
@app.get("/download_file/{filename}")
async def download_file(filename: str):
    """
    Downloads a specific file from the temporary directory.

    Currently only confirms that the file exists; no file content is returned.

    Raises:
        HTTPException 404: no regular file with that name is in TMP_DIR.
    """
    # The filename is untrusted input: keep only the final path component and
    # treat "." / ".." as not found so lookups cannot leave TMP_DIR.
    safe_name = os.path.basename(filename)
    if safe_name in ("", ".", ".."):
        raise HTTPException(status_code=404, detail="File not found.")

    file_path = os.path.join(TMP_DIR, safe_name)
    # isfile (not exists) so that a directory is not reported as a file.
    if not os.path.isfile(file_path):
        raise HTTPException(status_code=404, detail="File not found.")

    # In a real application, you would return a FileResponse here.
    # For this example, we'll just confirm the file exists.
    return {"message": f"File '{safe_name}' found at {file_path}. In a real app, this would be downloaded."}
236
+
237
@app.post("/process_file/{filename}")
async def process_file_data(filename: str):
    """
    Example endpoint to process data from an uploaded file.
    This assumes the file is already uploaded to the temporary directory.

    Returns the first five lines of the file, read as text.

    Raises:
        HTTPException 404: no regular file with that name is in TMP_DIR.
        HTTPException 500: the file could not be read (for example, it is not
            decodable as text).
    """
    from itertools import islice

    # The filename is untrusted input: keep only the final path component and
    # treat "." / ".." as not found so reads cannot leave TMP_DIR.
    safe_name = os.path.basename(filename)
    if safe_name in ("", ".", ".."):
        raise HTTPException(status_code=404, detail="File not found. Please upload it first.")

    file_path = os.path.join(TMP_DIR, safe_name)
    if not os.path.isfile(file_path):
        raise HTTPException(status_code=404, detail="File not found. Please upload it first.")

    try:
        with open(file_path, "r") as f:
            # islice stops after five lines instead of loading the whole file.
            content = list(islice(f, 5))  # Read first 5 lines
        return {"filename": safe_name, "processed_content_sample": content, "message": "File processed successfully."}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing file: {e}")
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
fastapi
uvicorn
python-multipart
pandas
# app.py uses `from google import genai`, which is shipped by the google-genai
# package (google-generativeai provides `google.generativeai` instead).
google-genai
# Engines pandas.read_excel needs for .xlsx and .xls files respectively.
openpyxl
xlrd
seaborn
matplotlib