ProfessionalMario committed on
Commit
b2fb95a
·
verified ·
1 Parent(s): 8a31448

Upload 6 files

Browse files
Files changed (6) hide show
  1. .gitattributes +3 -35
  2. .gitignore +231 -0
  3. Dockerfile +0 -0
  4. README.md +92 -17
  5. app.py +83 -69
  6. requirements.txt +9 -0
.gitattributes CHANGED
@@ -1,35 +1,3 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ project.tar.gz filter=lfs diff=lfs merge=lfs -text
2
+ vector_store/analyze_embeddings.pkl filter=lfs diff=lfs merge=lfs -text
3
+ *.pkl filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+
10
+ # Distribution / packaging
11
+ logs/*.log
12
+ *.joblib
13
+ data/raw/
14
+ .ipynb_checkpoints/
15
+ .Python
16
+ .vscode/
17
+ build/
18
+ develop-eggs/
19
+ dist/
20
+ downloads/
21
+ eggs/
22
+ .eggs/
23
+ lib/
24
+ lib64/
25
+ parts/
26
+ sdist/
27
+ var/
28
+ wheels/
29
+ share/python-wheels/
30
+ *.egg-info/
31
+ .installed.cfg
32
+ *.egg
33
+ MANIFEST
34
+
35
+ # PyInstaller
36
+ # Usually these files are written by a python script from a template
37
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
38
+ *.manifest
39
+ *.spec
40
+
41
+ # Installer logs
42
+ pip-log.txt
43
+ pip-delete-this-directory.txt
44
+
45
+ # Unit test / coverage reports
46
+ htmlcov/
47
+ .tox/
48
+ .nox/
49
+ .coverage
50
+ .coverage.*
51
+ .cache
52
+ nosetests.xml
53
+ coverage.xml
54
+ *.cover
55
+ *.py.cover
56
+ .hypothesis/
57
+ .pytest_cache/
58
+ cover/
59
+ csv_files/
60
+ csv_files.zip
61
+ data/datasets/
62
+ # vector_store/*.pkl
63
+ output/
64
+ logs/
65
+
66
+ # Translations
67
+ *.mo
68
+ *.pot
69
+ # Raw CSVs
70
+ csv_files/
71
+ csv_files.zip
72
+
73
+ # Processed datasets
74
+ data/datasets/
75
+
76
+ # Outputs & logs
77
+ output/
78
+ logs/
79
+
80
+ # Embeddings / binaries
81
+ # vector_store/*.pkl
82
+ *.parquet
83
+ *.feather
84
+ *.xlsx
85
+
86
+ # Django stuff:
87
+ *.log
88
+ local_settings.py
89
+ db.sqlite3
90
+ db.sqlite3-journal
91
+
92
+ # Flask stuff:
93
+ instance/
94
+ .webassets-cache
95
+
96
+ # Scrapy stuff:
97
+ .scrapy
98
+
99
+ # Sphinx documentation
100
+ docs/_build/
101
+
102
+ # PyBuilder
103
+ .pybuilder/
104
+ target/
105
+
106
+ # Jupyter Notebook
107
+ .ipynb_checkpoints
108
+
109
+ # IPython
110
+ profile_default/
111
+ ipython_config.py
112
+
113
+ # pyenv
114
+ # For a library or package, you might want to ignore these files since the code is
115
+ # intended to run in multiple environments; otherwise, check them in:
116
+ # .python-version
117
+
118
+ # pipenv
119
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
120
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
121
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
122
+ # install all needed dependencies.
123
+ #Pipfile.lock
124
+
125
+ # UV
126
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
127
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
128
+ # commonly ignored for libraries.
129
+ #uv.lock
130
+
131
+ # poetry
132
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
133
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
134
+ # commonly ignored for libraries.
135
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
136
+ #poetry.lock
137
+ #poetry.toml
138
+
139
+ # pdm
140
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
141
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
142
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
143
+ #pdm.lock
144
+ #pdm.toml
145
+ .pdm-python
146
+ .pdm-build/
147
+
148
+ # pixi
149
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
150
+ #pixi.lock
151
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
152
+ # in the .venv directory. It is recommended not to include this directory in version control.
153
+ .pixi
154
+
155
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
156
+ __pypackages__/
157
+
158
+ # Celery stuff
159
+ celerybeat-schedule
160
+ celerybeat.pid
161
+
162
+ # SageMath parsed files
163
+ *.sage.py
164
+
165
+ # Environments
166
+ .env
167
+ .envrc
168
+ .venv
169
+ env/
170
+ venv/
171
+ ENV/
172
+ env.bak/
173
+ venv.bak/
174
+
175
+ # Spyder project settings
176
+ .spyderproject
177
+ .spyproject
178
+
179
+ # Rope project settings
180
+ .ropeproject
181
+
182
+ # mkdocs documentation
183
+ /site
184
+
185
+ # mypy
186
+ .mypy_cache/
187
+ .dmypy.json
188
+ dmypy.json
189
+
190
+ # Pyre type checker
191
+ .pyre/
192
+
193
+ # pytype static type analyzer
194
+ .pytype/
195
+
196
+ # Cython debug symbols
197
+ cython_debug/
198
+
199
+ # PyCharm
200
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
201
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
202
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
203
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
204
+ #.idea/
205
+
206
+ # Abstra
207
+ # Abstra is an AI-powered process automation framework.
208
+ # Ignore directories containing user credentials, local state, and settings.
209
+ # Learn more at https://abstra.io/docs
210
+ .abstra/
211
+
212
+ # Visual Studio Code
213
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
214
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
215
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
216
+ # you could uncomment the following to ignore the entire vscode folder
217
+ # .vscode/
218
+
219
+ # Ruff stuff:
220
+ .ruff_cache/
221
+
222
+ # PyPI configuration file
223
+ .pypirc
224
+
225
+ # Cursor
226
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
227
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
228
+ # refer to https://docs.cursor.com/context/ignore-files
229
+ .cursorignore
230
+ .cursorindexingignore
231
+
Dockerfile ADDED
File without changes
README.md CHANGED
@@ -1,17 +1,92 @@
1
- ---
2
- title: EDA Explorer
3
- emoji: 💬
4
- colorFrom: yellow
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 6.5.1
8
- app_file: app.py
9
- pinned: false
10
- hf_oauth: true
11
- hf_oauth_scopes:
12
- - inference-api
13
- license: mit
14
- short_description: 'Performs EDA Operations '
15
- ---
16
-
17
- An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🚀 EDA Explorer – AI-Powered Data Analysis CLI
2
+
3
+ A lightweight CLI tool that automates exploratory data analysis (EDA) with intelligent insights, feature importance detection, and data quality checks.
4
+
5
+ Designed to simulate how an **AI Data Analyst** approaches real-world datasets during exploratory analysis.
6
+
7
+ ---
8
+
9
+ ## ⚡ Key Highlights
10
+
11
+ - 🔍 One-command analysis → `analyze <dataset>`
12
+ - 🧠 Auto target detection for ML-based insights
13
+ - 📈 Feature importance (no manual setup)
14
+ - ⚠️ Smart data warnings (missing, ID columns, constants)
15
+ - 📊 Correlation & outlier detection
16
+ - 📁 Auto report generation (.txt)
17
+ - Efficient handling of large datasets (Parquet + sampling)
18
+
19
+ ---
20
+
21
+ ## 🎬 Demo
22
+
23
+ 👉 Full demo: https://github.com/user-attachments/assets/7dff8329-71e8-4bca-ad01-404e75df8314
24
+
25
+ https://github.com/user-attachments/assets/7dff8329-71e8-4bca-ad01-404e75df8314
26
+
27
+ ---
28
+
29
+ ## 📊 Example Output
30
+
31
+ Top Correlations:
32
+ - age ↔ income: 0.72
33
+ - tenure ↔ balance: 0.65
34
+
35
+ ⚠️ Data Warnings:
36
+ - customer_id → likely ID column
37
+ - income → 52% missing values
38
+
39
+ 📈 Feature Importance:
40
+ - age: 0.41 (strong signal)
41
+ - tenure: 0.32 (strong signal)
42
+
43
+
44
+ ---
45
+
46
+ ## 🧠 What Makes It Stand Out
47
+
48
+ - Automatically identifies **useful vs irrelevant features**
49
+ - No manual preprocessing required
50
+ - Mimics real-world **data analyst reasoning**
51
+ - Built using a **modular agent-based system**
52
+
53
+ ---
54
+
55
+ ## ⚡ Performance
56
+
57
+ - Parquet-based storage for faster I/O
58
+ - Sampling strategy for large datasets
59
+
60
+ ---
61
+
62
+ ## 🛠️ System Design
63
+
64
+ - Command handler
65
+ - Dataset registry
66
+ - Modular agents (AnalysisAgent, etc.)
67
+ - Logger integration
68
+
69
+ ---
70
+
71
+ ## 📦 Datasets
72
+
73
+ - Titanic
74
+ - Customer Churn
75
+ - Credit Card Fraud
76
+
77
+ ---
78
+
79
+ ## 🛠️ Tech Stack
80
+
81
+ - Python
82
+ - Pandas, NumPy
83
+ - Scikit-learn
84
+ - Parquet
85
+
86
+ ---
87
+
88
+ ## 🚀 Future Enhancements
89
+
90
+ - RAG-based EDA advisor
91
+ - SQL query assistant
92
+ - Model training pipeline
app.py CHANGED
@@ -1,69 +1,83 @@
1
- import gradio as gr
2
- from huggingface_hub import InferenceClient
3
-
4
-
5
- def respond(
6
- message,
7
- history: list[dict[str, str]],
8
- system_message,
9
- max_tokens,
10
- temperature,
11
- top_p,
12
- hf_token: gr.OAuthToken,
13
- ):
14
- """
15
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
16
- """
17
- client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
18
-
19
- messages = [{"role": "system", "content": system_message}]
20
-
21
- messages.extend(history)
22
-
23
- messages.append({"role": "user", "content": message})
24
-
25
- response = ""
26
-
27
- for message in client.chat_completion(
28
- messages,
29
- max_tokens=max_tokens,
30
- stream=True,
31
- temperature=temperature,
32
- top_p=top_p,
33
- ):
34
- choices = message.choices
35
- token = ""
36
- if len(choices) and choices[0].delta.content:
37
- token = choices[0].delta.content
38
-
39
- response += token
40
- yield response
41
-
42
-
43
- """
44
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
45
- """
46
- chatbot = gr.ChatInterface(
47
- respond,
48
- additional_inputs=[
49
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
50
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
51
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
52
- gr.Slider(
53
- minimum=0.1,
54
- maximum=1.0,
55
- value=0.95,
56
- step=0.05,
57
- label="Top-p (nucleus sampling)",
58
- ),
59
- ],
60
- )
61
-
62
- with gr.Blocks() as demo:
63
- with gr.Sidebar():
64
- gr.LoginButton()
65
- chatbot.render()
66
-
67
-
68
- if __name__ == "__main__":
69
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ from rich.console import Console
4
+ from cli_app.command_handler import handle_command
5
+ from utils.logger import logger
6
+ from vector_store.instruction_embedder import embed_analyze_instructions
7
+ import os
8
+ import gradio as gr
9
+ from cli_app.command_handler import handle_command
10
+ from vector_store.instruction_embedder import embed_analyze_instructions
11
+
12
+ console = Console()
13
+
14
def run_web():
    """Launch a single-textbox Gradio front end for the command handler.

    Each submitted command string is passed to ``handle_command`` and the
    returned value is shown in a markdown-highlighted code panel. The server
    binds to all interfaces on port 7860 and this call blocks while it runs.
    """
    import gradio as gr

    # Input and output widgets, named up front so the Interface call stays short.
    command_box = gr.Textbox(
        label="EDA Command",
        placeholder="Type your command here...",
    )
    terminal_panel = gr.Code(label="Terminal Output", language="markdown")

    def chat_interface(command):
        # Thin adapter: same handler the interactive prompt uses.
        return handle_command(command)

    demo = gr.Interface(
        fn=chat_interface,
        inputs=command_box,
        outputs=terminal_panel,
        title="EDA Explorer",
        description="Web terminal for EDA Explorer. Type 'help' or your analysis commands.",
    )

    # HF Spaces uses port 7860
    demo.launch(server_name="0.0.0.0", server_port=7860)
31
+
32
def run_cli():
    """Run the interactive EDA Explorer prompt in the terminal.

    Calls ``embed_analyze_instructions()`` once, prints a banner, then reads
    commands in a loop and passes each one to ``handle_command``. A truthy
    result is printed to the console.

    The loop ends when the user types ``exit`` (case-insensitive, surrounding
    whitespace ignored), presses Ctrl-C, or the input stream reaches
    end-of-file. Any other exception raised while handling a command is
    logged and printed, and the prompt continues.
    """
    embed_analyze_instructions()
    console.print("\n[bold cyan]EDA Explorer[/bold cyan]")
    console.print("Type 'exit' to quit\n")

    while True:
        try:
            cmd = console.input("[bold yellow]> [/bold yellow]")
        except (KeyboardInterrupt, EOFError):
            # EOFError fires on Ctrl-D or when piped stdin is exhausted.
            # Treating it as "keep going" would re-raise on every iteration
            # and spin forever, so it ends the session like Ctrl-C does.
            break

        if cmd.strip().lower() == "exit":
            break

        try:
            result = handle_command(cmd)
            if result:
                console.print(result)
        except KeyboardInterrupt:
            break
        except Exception as e:
            # Top-level boundary: report the failure and keep the prompt alive.
            logger.error(f"CLI error | {e}")
            console.print(f"Error: {e}")
51
+
52
def run_space_interface():
    """Launch the Gradio ``ChatInterface`` front end for the command handler.

    Calls ``embed_analyze_instructions()`` once (the same preparation step
    ``run_cli`` performs), then serves a chat UI in which every user message
    is passed to ``handle_command``. The server binds to all interfaces on
    port 7860 and this call blocks while it runs.
    """
    # Pre-embed instructions just like run_cli() does
    embed_analyze_instructions()

    def chat_response(message, history):
        """Return the text to display for one chat message.

        ``history`` is supplied by ``gr.ChatInterface`` and is not used here.
        """
        try:
            result = handle_command(message)
        except Exception as e:
            # Mirror run_cli: log the failure and show it to the user instead
            # of letting the exception escape the chat callback.
            logger.error(f"Web error | {e}")
            return f"Error: {e}"

        if result == "exit":
            return "Session ended. Refresh the page to restart."

        return str(result) if result else "Command executed with no output."

    # ChatInterface is the closest 'look and feel' to a CLI
    demo = gr.ChatInterface(
        fn=chat_response,
        title="EDA Explorer Terminal",
        description="Type your EDA commands below. Works just like the CLI version!",
        examples=["help", "analyze data.csv", "status"],  # Optional: suggest commands
    )

    demo.launch(server_name="0.0.0.0", server_port=7860)
77
+
78
if __name__ == "__main__":
    # Serve the web UI when SPACE_ID is present in the environment;
    # otherwise fall back to the interactive terminal prompt.
    entry_point = run_space_interface if "SPACE_ID" in os.environ else run_cli
    entry_point()
83
+
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ matplotlib==3.10.9
2
+ numpy==2.4.5
3
+ pandas==3.0.3
4
+ plotext==5.3.2
5
+ psutil==7.2.2
6
+ Requests==2.34.2
7
+ rich==15.0.0
8
+ scikit_learn==1.8.0
9
+ sentence_transformers==5.3.0