nharshavardhana committed on
Commit
db2dba7
·
1 Parent(s): 73a67c7
Files changed (2) hide show
  1. app.py +389 -0
  2. requirements.txt +10 -0
app.py ADDED
@@ -0,0 +1,389 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import hdbscan
4
+ import openai
5
+ from openai import OpenAI
6
+ from sentence_transformers import SentenceTransformer
7
+ import umap
8
+ import ast
9
+ import markdown
10
+ from pathlib import Path
11
+ import gradio as gr
12
+
13
+
14
+ # ---------------------------------------------------------
15
+ # Load API key
16
+ # ---------------------------------------------------------
17
# Shared OpenAI client; the key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
18
+
19
+
20
+ # ---------------------------------------------------------
21
+ # Input loader: handles raw text, file path, or uploaded file
22
+ # ---------------------------------------------------------
23
def load_input(source):
    """Return the note text held by *source*.

    Args:
        source: One of
            - an ``os.PathLike`` object, read as a UTF-8 text file;
            - a string naming an existing file, read as a UTF-8 text file;
            - any other string, returned unchanged as raw text;
            - a file-like object exposing ``read()``, opened in either
              binary or text mode.

    Returns:
        str: The text content.

    Raises:
        ValueError: If *source* is none of the supported kinds.
        OSError: If a path-like *source* cannot be opened.
    """
    # A path object is unambiguous: it always means "read this file".
    if isinstance(source, os.PathLike):
        with open(source, "r", encoding="utf-8") as f:
            return f.read()

    if isinstance(source, str):
        # A string naming an existing file is loaded; anything else is raw text.
        if os.path.isfile(source):
            with open(source, "r", encoding="utf-8") as f:
                return f.read()
        return source

    # Uploaded / already-open file object.
    if hasattr(source, "read"):
        data = source.read()
        # Binary streams yield bytes; text-mode streams already yield str.
        if isinstance(data, (bytes, bytearray)):
            return data.decode("utf-8")
        return data

    raise ValueError("Unsupported input type. Pass text, file path, or uploaded file.")
38
+
39
+
40
+ # ---------------------------------------------------------
41
+ # Expand notes for better embedding semantic separation
42
+ # ---------------------------------------------------------
43
def expand_note(note: str) -> str:
    """Wrap *note* in a fixed instruction sentence used as the embedding input.

    The note is quoted verbatim and followed by two constant sentences; no
    model is called here, the text is only assembled.
    """
    sentences = [
        f"This note says: '{note}'. ",
        "Interpret it as a possible work task, personal task, reminder, idea, or question. ",
        "Expand the hidden meaning so semantic embeddings become more distinguishable.",
    ]
    return "".join(sentences)
49
+
50
+
51
+ # ---------------------------------------------------------
52
+ # Clustering with UMAP + tuned HDBSCAN
53
+ # ---------------------------------------------------------
54
_EMBEDDING_MODEL = None  # created on first use, then shared by later calls


def _get_embedding_model():
    """Load the sentence-embedding model once and reuse it afterwards."""
    global _EMBEDDING_MODEL
    if _EMBEDDING_MODEL is None:
        _EMBEDDING_MODEL = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    return _EMBEDDING_MODEL


def cluster_embeddings(expanded_notes):
    """Group notes by semantic similarity.

    The notes are embedded with a SentenceTransformer model, reduced with
    UMAP (cosine metric) and clustered with HDBSCAN (euclidean metric on the
    reduced vectors).

    Args:
        expanded_notes: Sequence of note strings to embed and cluster.

    Returns:
        dict[int, list[int]]: Maps each cluster label to the indices (into
        *expanded_notes*) of its members. An empty input gives ``{}``; a
        single note gives ``{-1: [0]}``. HDBSCAN uses the label ``-1`` for
        points it does not assign to any cluster.
    """
    n = len(expanded_notes)

    # Nothing to embed: avoid loading the model or fitting on an empty array.
    if n == 0:
        return {}

    # A single note cannot form a cluster; report it under the -1 label.
    if n == 1:
        return {-1: [0]}

    embeddings = _get_embedding_model().encode(expanded_notes)

    # Dimensionality reduction for cleaner clusters.
    # NOTE(review): behaviour of UMAP on very small inputs (a handful of
    # notes) is not verified here - confirm against the installed version.
    reducer = umap.UMAP(
        n_neighbors=5,
        min_dist=0.1,
        metric="cosine",
    )
    reduced = reducer.fit_transform(embeddings)

    # Small min_cluster_size / min_samples so that pairs of notes may cluster.
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=2,
        min_samples=1,
        cluster_selection_epsilon=0.2,
        metric="euclidean",
    )
    labels = clusterer.fit_predict(reduced)

    clusters = {}
    for idx, label in enumerate(labels):
        # Cast the numpy integer label to a plain int so it is JSON-friendly.
        clusters.setdefault(int(label), []).append(idx)

    return clusters
87
+
88
+
89
+ # ---------------------------------------------------------
90
+ # LLM cluster summarizer → asks for a JSON reply and parses it
91
+ # ---------------------------------------------------------
92
def summarize_cluster_with_llm(raw_items):
    """Ask the chat model for a title and summary describing *raw_items*.

    Args:
        raw_items: The notes of one cluster; interpolated into the prompt
            using their ``str()`` form.

    Returns:
        The parsed JSON reply. The prompt requests an object with ``"title"``
        and ``"summary"`` keys, but the reply's shape is not validated here.

    Raises:
        json.JSONDecodeError: If no JSON object can be parsed from the reply.
    """
    prompt = f"""
You must return ONLY valid JSON.
No markdown. No backticks. No explanations.

Notes:
{raw_items}

Return JSON exactly like:
{{
"title": "...",
"summary": "..."
}}
"""

    response = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
    )

    # The message content may be None; treat that as an empty reply.
    json_text = (response.choices[0].message.content or "").strip()

    try:
        return json.loads(json_text)
    except json.JSONDecodeError:
        # Fallback: the model wrapped the object in a code fence or prose.
        # Keep only the outermost {...} span instead of deleting substrings,
        # which would also damage note text that happens to contain "json".
        start = json_text.find("{")
        end = json_text.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(json_text[start:end + 1])
126
+
127
+
128
+ # ---------------------------------------------------------
129
+ # MAIN TOOL — works with text OR file path OR upload
130
+ # ---------------------------------------------------------
131
def cluster_notes_dynamic(input_data) -> dict:
    """
    Cluster free-form notes into semantic groups and summarize each group.

    Args:
        input_data (str or file-like):
            - Raw multiline text with one note per line, OR
            - A file path to a text file, OR
            - A file-like object exposing ``read()``.

    Returns:
        dict: A JSON-like dictionary structure:
        {
            "clusters": [
                {
                    "id": <cluster_id>,
                    "items": [list of notes],
                    "analysis": <result of the LLM summarizer>
                },
                ...
            ]
        }
        When the input holds no non-blank lines, ``{"clusters": []}``.

    Behavior:
        - Loads the text through ``load_input``.
        - Treats every non-blank line, stripped of surrounding whitespace,
          as one note.
        - Expands each note with ``expand_note`` before clustering with
          ``cluster_embeddings``.
        - Calls ``summarize_cluster_with_llm`` once per cluster on the
          original (unexpanded) notes.
    """
    stripped_lines = (line.strip() for line in load_input(input_data).split("\n"))
    raw_notes = [line for line in stripped_lines if line]

    # No usable notes: skip embedding and the LLM entirely.
    if not raw_notes:
        return {"clusters": []}

    groups = cluster_embeddings([expand_note(note) for note in raw_notes])

    clusters = []
    for cluster_id, member_indices in groups.items():
        notes = [raw_notes[i] for i in member_indices]
        summary = summarize_cluster_with_llm(notes)
        clusters.append({"id": cluster_id, "items": notes, "analysis": summary})

    return {"clusters": clusters}
189
+
190
+
191
+
192
+ def convert_structure_to_markdown(structured_json: dict | str) -> str:
193
+ """
194
+ Converts a structured notes JSON object into a clean, readable Markdown document.
195
+
196
+ Args:
197
+ structured_json (dict | str):
198
+ Either a Python dictionary or a JSON string containing clustered notes
199
+ in the format produced by `cluster_notes_dynamic`.
200
+ Example structure:
201
+ {
202
+ "clusters": [
203
+ {
204
+ "id": 0,
205
+ "items": [...],
206
+ "analysis": {"title": "...", "summary": "..."}
207
+ }
208
+ ]
209
+ }
210
+
211
+ Returns:
212
+ str: A Markdown-formatted representation of all clusters, including
213
+ titles, summaries, and individual note items.
214
+ """
215
+
216
+ # Convert string input into dict
217
+ if isinstance(structured_json, str):
218
+ try:
219
+ structured_json = json.loads(structured_json)
220
+ except:
221
+ structured_json = ast.literal_eval(structured_json)
222
+
223
+ md = "# ๐Ÿ—‚ Structured Notes\n\n"
224
+
225
+ for cluster in structured_json["clusters"]:
226
+ title = cluster["analysis"]["title"]
227
+ summary = cluster["analysis"]["summary"]
228
+ items = cluster["items"]
229
+
230
+ md += f"## {title}\n"
231
+ md += f"{summary}\n\n"
232
+ md += "### Notes:\n"
233
+ for item in items:
234
+ md += f"- {item}\n"
235
+ md += "\n"
236
+
237
+ return md
238
+
239
+
240
+
241
def generate_minimal_google_font_html(md_text: str, font: str = "Inter") -> str:
    """
    Convert Markdown text into a small styled HTML file using a Google Font.

    Args:
        md_text (str):
            The Markdown-formatted text to convert into HTML.
        font (str, optional):
            The Google Font family name to apply. Defaults to "Inter".
            If empty, None, or left with no usable characters after
            sanitizing, "Inter" is used.

    Returns:
        str:
            The file path of the generated HTML file, named
            ``notes_export.html`` inside a freshly created temporary
            directory.
    """
    import re
    import tempfile

    # The name is interpolated into a URL attribute and a quoted CSS value,
    # so drop every character that could terminate either context.
    font = re.sub(r"[^A-Za-z0-9 \-]", "", font or "").strip() or "Inter"

    # Convert markdown to HTML
    html_body = markdown.markdown(md_text)

    # Google Font URL (spaces replaced with +)
    font_url = font.replace(" ", "+")

    # Build final HTML
    final_html = f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Notes Export</title>
<link href="https://fonts.googleapis.com/css2?family={font_url}:wght@300;400;600&display=swap" rel="stylesheet">
<style>
body {{
    font-family: '{font}', sans-serif;
    max-width: 800px;
    margin: 40px auto;
    padding: 20px;
    line-height: 1.6;
    color: #222;
}}
h1, h2, h3 {{
    font-weight: 600;
}}
ul {{
    margin-left: 20px;
}}
</style>
</head>
<body>
{html_body}
</body>
</html>
""".strip()

    # Write into a per-call temporary directory: "/content" only exists on
    # Colab, and a fixed path would be overwritten by concurrent requests.
    export_dir = Path(tempfile.mkdtemp(prefix="notes_export_"))
    output_path = export_dir / "notes_export.html"
    output_path.write_text(final_html, encoding="utf-8")

    return str(output_path)
304
+
305
+
306
+
307
+
308
def cluster_notes_entry(text_input, file_input):
    """
    Gradio entry point that accepts either typed text or an uploaded file.

    Args:
        text_input: Notes typed into the textbox, one per line (may be
            empty or None).
        file_input: The uploaded file, or a falsy value when nothing was
            uploaded. An upload takes precedence over typed text.

    Returns:
        The dictionary produced by ``cluster_notes_dynamic`` on success, or
        a human-readable message string when the file cannot be read or no
        input was given.
    """
    # 1. If a file was uploaded
    if file_input:
        # NOTE(review): assumes the upload arrives either as a path string or
        # as an object whose ``name`` attribute is the path - confirm against
        # the installed Gradio version.
        file_path = getattr(file_input, "name", file_input)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
        except (OSError, ValueError, TypeError) as e:
            # Only genuine read/decode failures are reported as file errors;
            # UnicodeDecodeError is a ValueError subclass.
            return f"Error reading file: {e}"
        # Clustering runs outside the try so its failures are not mislabeled
        # as file-reading errors.
        return cluster_notes_dynamic(content)

    # 2. If raw text was entered
    if text_input and text_input.strip():
        return cluster_notes_dynamic(text_input)

    return "Please enter text or upload a file."
327
+
328
+
329
# Tab 1: cluster notes given as typed text or as an uploaded text file.
notes_interface = gr.Interface(
    fn=cluster_notes_entry,
    inputs=[
        gr.Textbox(
            label="Enter notes (one per line)",
            placeholder="Need to call my brother\nSend email\nResearch project",
            lines=5
        ),
        gr.File(
            label="Upload notes file (.txt)",
            file_types=["text"]
        )
    ],
    outputs=gr.Textbox(label="Clustered Output", lines=20),
    api_name="cluster_notes_dynamic"
)

# Tab 2: turn the clustered structure (as text) into Markdown.
markdown_interface = gr.Interface(
    fn=convert_structure_to_markdown,
    inputs=gr.Textbox(label="Clustered input"),
    outputs=gr.Textbox(label="Markdown output", lines=20),
    api_name="convert_structure_to_markdown"
)

# Tab 3: export Markdown as a downloadable, styled HTML file.
html_interface = gr.Interface(
    fn=generate_minimal_google_font_html,
    inputs=[
        gr.Textbox(label="Markdown Input", lines=12, placeholder="# Your Markdown here..."),
        gr.Textbox(label="Google Font (optional)", placeholder="Inter (default)")
    ],
    outputs=gr.File(label="Download HTML"),
    title="Markdown → Styled HTML Converter",
    description="Converts markdown into a clean HTML file styled with Google Fonts."
)


# Top-level app: a heading plus one tab per interface defined above.
with gr.Blocks(title="NeatNote: A smart note-clustering MCP server that transforms unstructured text into clear, organized insights using semantic embeddings and LLM summaries.") as demo:
    gr.Markdown("\n# NeatNote 🗂\n")

    gr.TabbedInterface(
        [
            notes_interface,
            markdown_interface,
            html_interface
            # Add more tools here
        ],
        [
            "notes_interface",
            "markdown_interface",
            "html_interface"
            # Add more tool tabs here
        ]
    )


if __name__ == "__main__":
    # mcp_server=True is passed through to Gradio's launch().
    demo.launch(mcp_server=True)
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ openai
2
+ sentence-transformers
3
+ hdbscan
4
+ umap-learn
5
+ markdown
6
+ gradio
7
+ numpy
8
+ scikit-learn
9
+ scipy
10
+ pandas