Spaces:

MCP-1st-Birthday
/

NeatNote

Sleeping

App Files Files Community

nharshavardhana committed on Nov 19, 2025

Commit

db2dba7

1 Parent(s): 73a67c7

commit

Browse files

Files changed (2) hide show

app.py +389 -0
requirements.txt +10 -0

app.py ADDED Viewed

	@@ -0,0 +1,389 @@

+import os
+import json
+import hdbscan
+import openai
+from openai import OpenAI
+from sentence_transformers import SentenceTransformer
+import umap
+import ast
+import markdown
+from pathlib import Path
+import gradio as gr
+# ---------------------------------------------------------
+# Load API key
+# ---------------------------------------------------------
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+# ---------------------------------------------------------
+# Input loader: handles raw text, file path, or uploaded file
+# ---------------------------------------------------------
+def load_input(source):
+    # Path string → load file
+    if isinstance(source, str) and os.path.isfile(source):
+        with open(source, "r", encoding="utf-8") as f:
+            return f.read()
+    # Raw text
+    if isinstance(source, str):
+        return source
+    # Uploaded file (Colab or Claude)
+    if hasattr(source, "read"):
+        return source.read().decode("utf-8")
+    raise ValueError("Unsupported input type. Pass text, file path, or uploaded file.")
+# ---------------------------------------------------------
+# Expand notes for better embedding semantic separation
+# ---------------------------------------------------------
+def expand_note(note: str) -> str:
+    return (
+        f"This note says: '{note}'. "
+        "Interpret it as a possible work task, personal task, reminder, idea, or question. "
+        "Expand the hidden meaning so semantic embeddings become more distinguishable."
+    )
+# ---------------------------------------------------------
+# Clustering with UMAP + tuned HDBSCAN
+# ---------------------------------------------------------
+def cluster_embeddings(expanded_notes):
+    n = len(expanded_notes)
+    # If only 1 note → trivial cluster
+    if n == 1:
+        return {-1: [0]}
+    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+    embeddings = model.encode(expanded_notes)
+    # Dimensionality reduction for cleaner clusters
+    reducer = umap.UMAP(
+        n_neighbors=5,
+        min_dist=0.1,
+        metric="cosine"
+    )
+    reduced = reducer.fit_transform(embeddings)
+    # Stronger clustering behavior
+    clusterer = hdbscan.HDBSCAN(
+        min_cluster_size=2,
+        min_samples=1,
+        cluster_selection_epsilon=0.2,
+        metric='euclidean'
+    )
+    labels = clusterer.fit_predict(reduced)
+    clusters = {}
+    for idx, label in enumerate(labels):
+        clusters.setdefault(int(label), []).append(idx)
+    return clusters
+# ---------------------------------------------------------
+# LLM cluster summarizer → always returns valid JSON
+# ---------------------------------------------------------
+def summarize_cluster_with_llm(raw_items):
+    prompt = f"""
+You must return ONLY valid JSON.
+No markdown. No backticks. No explanations.
+Notes:
+{raw_items}
+Return JSON exactly like:
+{{
+  "title": "...",
+  "summary": "..."
+}}
+"""
+    response = client.chat.completions.create(
+        model="gpt-4.1-mini",
+        messages=[{"role": "user", "content": prompt}],
+        temperature=0.2,
+    )
+    json_text = response.choices[0].message.content.strip()
+    # Try direct JSON parse
+    try:
+        return json.loads(json_text)
+    except:
+        # Fallback: remove accidental formatting
+        cleaned = (
+            json_text.replace("```", "")
+            .replace("json", "")
+            .strip()
+        )
+        return json.loads(cleaned)
+# ---------------------------------------------------------
+# MAIN TOOL — Works with text OR file path OR upload
+# ---------------------------------------------------------
+def cluster_notes_dynamic(input_data) -> dict:
+    """
+    Dynamically clusters unstructured notes into semantic groups using embeddings and HDBSCAN,
+    then summarizes each cluster with an LLM.
+    Args:
+        input_data (str or file-like):
+            - Raw multiline text containing notes, OR
+            - A file path to a text file, OR
+            - A file-like object uploaded in environments such as Colab or Gradio.
+    Returns:
+        dict: A JSON-like dictionary structure:
+        {
+            "clusters": [
+                {
+                    "id": <cluster_id>,
+                    "items": [list of notes],
+                    "analysis": {
+                        "title": "...",
+                        "summary": "..."
+                    }
+                },
+                ...
+            ]
+        }
+    Behavior:
+        - Automatically detects if input is text or file path.
+        - Expands each note for better semantic embedding separation.
+        - Uses SentenceTransformer embeddings + HDBSCAN for density-based clustering.
+        - Uses an LLM to generate a clean title and summary for each cluster.
+        - Returns strictly structured output for downstream formatting tools.
+    """
+    text = load_input(input_data)
+    # Parse text into notes
+    raw_notes = [l.strip() for l in text.split("\n") if l.strip()]
+    if not raw_notes:
+        return {"clusters": []}
+    expanded_notes = [expand_note(n) for n in raw_notes]
+    cluster_map = cluster_embeddings(expanded_notes)
+    results = []
+    for cid, idx_list in cluster_map.items():
+        items = [raw_notes[i] for i in idx_list]
+        analysis = summarize_cluster_with_llm(items)
+        results.append({
+            "id": cid,
+            "items": items,
+            "analysis": analysis
+        })
+    return {"clusters": results}
+def convert_structure_to_markdown(structured_json: dict | str) -> str:
+    """
+    Converts a structured notes JSON object into a clean, readable Markdown document.
+    Args:
+        structured_json (dict | str):
+            Either a Python dictionary or a JSON string containing clustered notes
+            in the format produced by `cluster_notes_dynamic`.
+            Example structure:
+            {
+                "clusters": [
+                    {
+                        "id": 0,
+                        "items": [...],
+                        "analysis": {"title": "...", "summary": "..."}
+                    }
+                ]
+            }
+    Returns:
+        str: A Markdown-formatted representation of all clusters, including
+             titles, summaries, and individual note items.
+    """
+    # Convert string input into dict
+    if isinstance(structured_json, str):
+        try:
+            structured_json = json.loads(structured_json)
+        except:
+            structured_json = ast.literal_eval(structured_json)
+    md = "# 🗂 Structured Notes\n\n"
+    for cluster in structured_json["clusters"]:
+        title = cluster["analysis"]["title"]
+        summary = cluster["analysis"]["summary"]
+        items = cluster["items"]
+        md += f"## {title}\n"
+        md += f"{summary}\n\n"
+        md += "### Notes:\n"
+        for item in items:
+            md += f"- {item}\n"
+        md += "\n"
+    return md
+def generate_minimal_google_font_html(md_text: str, font: str = "Inter") -> str:
+    """
+    Converts Markdown text into a simple, styled HTML document using a Google Font.
+    Args:
+        md_text (str):
+            The Markdown-formatted text to convert into HTML.
+        font (str, optional):
+            The Google Font to apply to the exported HTML.
+            Defaults to "Inter". If empty or None, "Inter" is used automatically.
+    Returns:
+        str:
+            The file path of the generated HTML file, which can be returned
+            directly to Gradio for download.
+    """
+    # If no font was provided
+    if not font or font.strip() == "":
+        font = "Inter"
+    # Convert markdown to HTML
+    html_body = markdown.markdown(md_text)
+    # Google Font URL (spaces replaced with +)
+    font_url = font.replace(" ", "+")
+    # Build final HTML
+    final_html = f"""
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="utf-8">
+    <title>Notes Export</title>
+    <link href="https://fonts.googleapis.com/css2?family={font_url}:wght@300;400;600&display=swap" rel="stylesheet">
+    <style>
+        body {{
+            font-family: '{font}', sans-serif;
+            max-width: 800px;
+            margin: 40px auto;
+            padding: 20px;
+            line-height: 1.6;
+            color: #222;
+        }}
+        h1, h2, h3 {{
+            font-weight: 600;
+        }}
+        ul {{
+            margin-left: 20px;
+        }}
+    </style>
+</head>
+<body>
+{html_body}
+</body>
+</html>
+""".strip()
+    # Save file
+    output_path = "/content/notes_export.html"
+    Path(output_path).write_text(final_html, encoding="utf-8")
+    return output_path
+def cluster_notes_entry(text_input, file_input):
+    """
+    Wrapper function so Gradio can pass either text OR file.
+    """
+    # 1. If a file was uploaded
+    if file_input:
+        try:
+            # file_input is a temporary file path string
+            with open(file_input, "r", encoding="utf-8") as f:
+                content = f.read()
+            return cluster_notes_dynamic(content)
+        except Exception as e:
+            return f"Error reading file: {e}"
+    # 2. If raw text was entered
+    if text_input and text_input.strip():
+        return cluster_notes_dynamic(text_input)
+    return "Please enter text or upload a file."
+# Clustering tab: takes typed notes and/or an uploaded text file (both are
+# passed to cluster_notes_entry) and shows the returned value in a textbox.
+notes_interface = gr.Interface(
+    fn=cluster_notes_entry,
+    inputs=[
+        gr.Textbox(
+            label="Enter notes (one per line)",
+            placeholder="Need to call my brother\nSend email\nResearch project",
+            lines=5
+        ),
+        gr.File(
+            label="Upload notes file (.txt)",
+            file_types=["text"]
+        )
+    ],
+    outputs=gr.Textbox(label="Clustered Output", lines=20),
+    # Named after the underlying tool rather than the wrapper function.
+    api_name="cluster_notes_dynamic"
+)
+# Markdown tab: the textbox content is passed as a string to
+# convert_structure_to_markdown, which parses it before formatting.
+markdown_interface = gr.Interface(
+    fn=convert_structure_to_markdown,
+    inputs=gr.Textbox(label="Clustered input"),
+    outputs=gr.Textbox(label="Markdown output",lines=20),
+    api_name="convert_structure_to_markdown"
+)
+# HTML export tab: Markdown text plus an optional font name go to
+# generate_minimal_google_font_html; the returned file path is offered as a
+# download. Unlike the other two interfaces, no api_name is set here.
+html_interface = gr.Interface(
+    fn=generate_minimal_google_font_html,
+    inputs=[
+        gr.Textbox(label="Markdown Input", lines=12, placeholder="# Your Markdown here..."),
+        gr.Textbox(label="Google Font (optional)", placeholder="Inter (default)")
+    ],
+    outputs=gr.File(label="Download HTML"),
+    title="Markdown → Styled HTML Converter",
+    description="Converts markdown into a clean HTML file styled with Google Fonts."
+)
+# Top-level app: a heading followed by the three interfaces as tabs. The
+# second list holds the tab labels, in the same order as the interfaces.
+with gr.Blocks(title="NeatNote: A smart note-clustering MCP server that transforms unstructured text into clear, organized insights using semantic embeddings and LLM summaries.") as demo:
+    gr.Markdown("""
+        # NeatNote 🗂
+    """)
+    gr.TabbedInterface(
+        [
+            notes_interface,
+            markdown_interface,
+            html_interface
+            # Add more tools here
+        ],
+        [
+            "notes_interface",
+            "markdown_interface",
+            "html_interface"
+            # Add more tool tabs here
+        ]
+    )
+# Launch only when run as a script; mcp_server=True asks Gradio to start its
+# MCP server alongside the web UI.
+if __name__ == "__main__":
+    demo.launch(mcp_server=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+openai
+sentence-transformers
+hdbscan
+umap-learn
+markdown
+gradio
+numpy
+scikit-learn
+scipy
+pandas