# NeatNote / app.py
# (Hugging Face Space header: nharshavardhana — commit b684c99)
# Standard library
import ast
import html
import json
import os
from pathlib import Path

# Third-party
import gradio as gr
import hdbscan
import markdown
import openai
import umap
from openai import OpenAI
from sentence_transformers import SentenceTransformer
# ---------------------------------------------------------
# Load API key
# ---------------------------------------------------------
# NOTE(review): if OPENAI_API_KEY is unset, os.getenv returns None and the
# first API call will fail with an auth error — consider failing fast here.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# ---------------------------------------------------------
# Input loader: handles raw text, file path, or uploaded file
# ---------------------------------------------------------
def load_input(source):
    """Return note text from raw text, a file path, or a file-like object.

    Args:
        source: One of
            - str naming an existing file -> the file's UTF-8 contents,
            - any other str -> returned unchanged as raw note text,
            - object with .read() (e.g. an upload) -> its contents, decoded
              from UTF-8 when read() yields bytes.

    Returns:
        str: The note text.

    Raises:
        ValueError: If ``source`` is none of the supported kinds.
    """
    if isinstance(source, str):
        # A string that names an existing file is treated as a path.
        if os.path.isfile(source):
            with open(source, "r", encoding="utf-8") as f:
                return f.read()
        return source
    if hasattr(source, "read"):
        data = source.read()
        # Uploads opened in binary mode yield bytes; text mode already
        # yields str (the old unconditional .decode crashed on str).
        return data.decode("utf-8") if isinstance(data, bytes) else data
    raise ValueError("Unsupported input type. Pass text, file path, or uploaded file.")
# ---------------------------------------------------------
# Expand notes for better embedding semantic separation
# ---------------------------------------------------------
def expand_note(note: str) -> str:
    """Wrap a raw note in an interpretation prompt so its embedding
    separates more cleanly from unrelated notes."""
    quoted = f"This note says: '{note}'. "
    instructions = (
        "Interpret it as a possible work task, personal task, reminder, idea, or question. "
        "Expand the hidden meaning so semantic embeddings become more distinguishable."
    )
    return quoted + instructions
# ---------------------------------------------------------
# Clustering with UMAP + tuned HDBSCAN
# ---------------------------------------------------------
def cluster_embeddings(expanded_notes):
    """Group expanded notes into clusters of note indices.

    Args:
        expanded_notes: List of note strings (already expanded for embedding).

    Returns:
        dict: Maps cluster label (int, -1 = noise) to a list of indices
        into ``expanded_notes``.
    """
    n = len(expanded_notes)
    # A single note cannot be clustered; report it as noise.
    if n == 1:
        return {-1: [0]}
    # Cache the embedding model across calls — loading it on every
    # request is the dominant cost of this function.
    model = getattr(cluster_embeddings, "_model", None)
    if model is None:
        model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        cluster_embeddings._model = model
    embeddings = model.encode(expanded_notes)
    # Dimensionality reduction for cleaner clusters. UMAP requires
    # n_neighbors < n, so clamp it for small note batches.
    reducer = umap.UMAP(
        n_neighbors=min(5, max(2, n - 1)),
        min_dist=0.1,
        metric="cosine"
    )
    reduced = reducer.fit_transform(embeddings)
    # Density-based clustering tuned to allow small (size-2) clusters.
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=2,
        min_samples=1,
        cluster_selection_epsilon=0.2,
        metric='euclidean'
    )
    labels = clusterer.fit_predict(reduced)
    # Invert labels -> {label: [indices]}.
    clusters = {}
    for idx, label in enumerate(labels):
        clusters.setdefault(int(label), []).append(idx)
    return clusters
# ---------------------------------------------------------
# LLM cluster summarizer → always returns valid JSON
# ---------------------------------------------------------
def summarize_cluster_with_llm(raw_items):
    """Ask the LLM for a title and summary of one cluster's notes.

    Args:
        raw_items: List of raw note strings belonging to one cluster.

    Returns:
        dict: Parsed JSON with "title" and "summary" keys.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON even
            after stripping accidental markdown code fences.
    """
    prompt = f"""
You must return ONLY valid JSON.
No markdown. No backticks. No explanations.
Notes:
{raw_items}
Return JSON exactly like:
{{
"title": "...",
"summary": "..."
}}
"""
    response = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
    )
    json_text = response.choices[0].message.content.strip()
    # Try direct JSON parse first.
    try:
        return json.loads(json_text)
    except json.JSONDecodeError:
        # Fallback: the model sometimes wraps output in a ``` / ```json
        # fence despite instructions. Strip only the fence — the previous
        # blanket .replace("json", "") corrupted any payload that happened
        # to contain the word "json".
        cleaned = json_text.strip()
        if cleaned.startswith("```"):
            cleaned = cleaned.strip("`").strip()
            if cleaned.lower().startswith("json"):
                cleaned = cleaned[4:].strip()
        return json.loads(cleaned)
# ---------------------------------------------------------
# MAIN TOOL — Works with text OR file path OR upload
# ---------------------------------------------------------
def cluster_notes_dynamic(input_data) -> dict:
    """Cluster unstructured notes into semantic groups and summarize each.

    Accepts raw multiline text, a path to a text file, or an uploaded
    file-like object. Each non-blank line is treated as one note; notes
    are expanded for embedding, clustered with HDBSCAN, and each cluster
    is given an LLM-generated title and summary.

    Args:
        input_data (str or file-like): Notes text, file path, or upload.

    Returns:
        dict: {"clusters": [{"id": <cluster_id>,
                             "items": [notes...],
                             "analysis": {"title": "...", "summary": "..."}},
                            ...]}
    """
    text = load_input(input_data)
    # One note per non-blank line.
    raw_notes = [line.strip() for line in text.split("\n") if line.strip()]
    if not raw_notes:
        return {"clusters": []}
    cluster_map = cluster_embeddings([expand_note(note) for note in raw_notes])
    results = []
    for cid, idx_list in cluster_map.items():
        members = [raw_notes[i] for i in idx_list]
        results.append({
            "id": cid,
            "items": members,
            "analysis": summarize_cluster_with_llm(members),
        })
    return {"clusters": results}
def convert_structure_to_markdown(structured_json: dict | str) -> str:
"""
Converts a structured notes JSON object into a clean, readable Markdown document.
Args:
structured_json (dict | str):
Either a Python dictionary or a JSON string containing clustered notes
in the format produced by `cluster_notes_dynamic`.
Example structure:
{
"clusters": [
{
"id": 0,
"items": [...],
"analysis": {"title": "...", "summary": "..."}
}
]
}
Returns:
str: A Markdown-formatted representation of all clusters, including
titles, summaries, and individual note items.
"""
# Convert string input into dict
if isinstance(structured_json, str):
try:
structured_json = json.loads(structured_json)
except:
structured_json = ast.literal_eval(structured_json)
md = "# πŸ—‚ Structured Notes\n\n"
for cluster in structured_json["clusters"]:
title = cluster["analysis"]["title"]
summary = cluster["analysis"]["summary"]
items = cluster["items"]
md += f"## {title}\n"
md += f"{summary}\n\n"
md += "### Notes:\n"
for item in items:
md += f"- {item}\n"
md += "\n"
return md
def generate_minimal_google_font_html(md_text: str, font: str = "Inter") -> str:
    """Convert Markdown into a minimal HTML page styled with a Google Font.

    Args:
        md_text: The Markdown-formatted text to render.
        font: Google Font family name. Empty/None/whitespace falls back
            to "Inter".

    Returns:
        str: Path of the generated HTML file, which can be returned
        directly to Gradio for download.
    """
    # Fall back to the default for empty / whitespace-only input.
    font = (font or "").strip() or "Inter"
    # The font name comes straight from a user-facing textbox: escape it
    # before interpolating into HTML/CSS to prevent markup injection.
    safe_font = html.escape(font, quote=True)
    # Google Fonts URLs replace spaces with '+'.
    font_url = safe_font.replace(" ", "+")
    # Convert markdown to HTML body content.
    html_body = markdown.markdown(md_text)
    # Build final HTML document.
    final_html = f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Notes Export</title>
<link href="https://fonts.googleapis.com/css2?family={font_url}:wght@300;400;600&display=swap" rel="stylesheet">
<style>
body {{
font-family: '{safe_font}', sans-serif;
max-width: 800px;
margin: 40px auto;
padding: 20px;
line-height: 1.6;
color: #222;
}}
h1, h2, h3 {{
font-weight: 600;
}}
ul {{
margin-left: 20px;
}}
</style>
</head>
<body>
{html_body}
</body>
</html>
""".strip()
    # Write into ./outputs so HF Spaces can serve the file for download.
    output_dir = Path("outputs")
    output_dir.mkdir(exist_ok=True)
    output_path = output_dir / "notes_export.html"
    output_path.write_text(final_html, encoding="utf-8")
    return str(output_path)
notes_interface = gr.Interface(
fn=cluster_notes_dynamic,
inputs=gr.Textbox(
label="Enter notes (one per line)",
placeholder="Need to call my brother\nSend email\nResearch project",
lines=5
),
outputs=gr.Textbox(label="Clustered Output", lines=20),
api_name="cluster_notes_dynamic"
)
markdown_interface = gr.Interface(
fn=convert_structure_to_markdown,
inputs=gr.Textbox(label="Clustered input"),
outputs=gr.Textbox(label="Markdown output",lines=20),
api_name="convert_structure_to_markdown"
)
html_interface = gr.Interface(
fn=generate_minimal_google_font_html,
inputs=[
gr.Textbox(label="Markdown Input", lines=12, placeholder="# Your Markdown here..."),
gr.Textbox(label="Font (optional)", placeholder="Inter (default)")
],
outputs=gr.File(label="Download HTML"),
title="Markdown β†’ Styled HTML Converter",
description="Converts markdown into a clean HTML file styled with Google Fonts."
)
# Assemble the tabbed UI; launched (as an MCP server) under __main__.
with gr.Blocks(title="NeatNote: A smart note-clustering MCP server") as demo:
    gr.Markdown("""
## 🗂 NeatNote transforms unstructured text into clear, organized insights using semantic embeddings and LLM summaries.
""")
    # One tab per tool, in workflow order (cluster -> markdown -> html).
    gr.TabbedInterface(
        [
            notes_interface,
            markdown_interface,
            html_interface
            # Add more tools here
        ],
        [
            "notes_interface",
            "markdown_interface",
            "html_interface"
            # Add more tool tabs here
        ]
    )
    gr.Markdown("""
# 🚀 Workflow: From Raw Notes → Clusters → Markdown → Beautiful HTML
1. **Cluster Notes:** Paste your raw notes and let AI organize them.
2. **Convert to Markdown:** Turn the structured clusters into clean Markdown.
3. **Export as HTML:** Convert Markdown into a polished, shareable HTML file.
""")
    gr.Markdown("""
# 🎥 [Watch the Demo on YouTube](https://youtu.be/6d8Drbmn0-s?si=fk716CYMBnIpLKME)
""")
    gr.Markdown("""
# **Example prompt to use on claude**: Convert the attached file messy notes into beautiful structured notes in Inconsolata font html file
""")
if __name__ == "__main__":
    # mcp_server=True exposes each api_name endpoint as an MCP tool.
    demo.launch(mcp_server=True)