import ast
import json
import os
from pathlib import Path

import gradio as gr
import hdbscan
import markdown
import openai
import umap
from openai import OpenAI
from sentence_transformers import SentenceTransformer

# ---------------------------------------------------------
# Load API key
# ---------------------------------------------------------
# NOTE(review): OpenAI() raises lazily — a missing OPENAI_API_KEY only
# surfaces on the first chat-completion call, not at import time.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


# ---------------------------------------------------------
# Input loader: handles raw text, file path, or uploaded file
# ---------------------------------------------------------
def load_input(source):
    """Return the text content of *source*.

    Accepts, in priority order:
      1. a string that is an existing file path -> file contents,
      2. any other string -> the string itself (raw notes),
      3. a file-like object with ``.read()`` (e.g. a Gradio/Colab upload)
         -> its bytes decoded as UTF-8.

    Raises:
        ValueError: if *source* is none of the above.
    """
    # Path string → load file.  Checked before the raw-text branch so a
    # string that happens to be a valid path is treated as a file.
    if isinstance(source, str) and os.path.isfile(source):
        with open(source, "r", encoding="utf-8") as f:
            return f.read()

    # Raw text
    if isinstance(source, str):
        return source

    # Uploaded file (Colab or Claude) — assumes UTF-8 encoded bytes.
    if hasattr(source, "read"):
        return source.read().decode("utf-8")

    raise ValueError("Unsupported input type. Pass text, file path, or uploaded file.")


# ---------------------------------------------------------
# Expand notes for better embedding semantic separation
# ---------------------------------------------------------
def expand_note(note: str) -> str:
    """Wrap a short note in an interpretive prompt so embeddings separate better."""
    return (
        f"This note says: '{note}'. "
        "Interpret it as a possible work task, personal task, reminder, idea, or question. "
        "Expand the hidden meaning so semantic embeddings become more distinguishable."
    )


# ---------------------------------------------------------
# Clustering with UMAP + tuned HDBSCAN
# ---------------------------------------------------------
# Cache the embedding model at module level: loading it from disk on every
# request is the dominant cost of cluster_embeddings().
_EMBEDDING_MODEL = None


def _get_embedding_model():
    """Lazily load and memoize the SentenceTransformer model."""
    global _EMBEDDING_MODEL
    if _EMBEDDING_MODEL is None:
        _EMBEDDING_MODEL = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    return _EMBEDDING_MODEL


def cluster_embeddings(expanded_notes):
    """Cluster expanded notes and return ``{label: [note_indices]}``.

    Embeds the notes, reduces dimensionality with UMAP, then runs HDBSCAN.
    Label ``-1`` is HDBSCAN's noise bucket.

    Args:
        expanded_notes (list[str]): non-empty list of expanded note strings.

    Returns:
        dict[int, list[int]]: cluster label -> indices into *expanded_notes*.
    """
    n = len(expanded_notes)

    # If only 1 note → trivial cluster
    if n == 1:
        return {-1: [0]}

    embeddings = _get_embedding_model().encode(expanded_notes)

    if n > 5:
        # Dimensionality reduction for cleaner clusters.
        # n_neighbors must stay below the sample count or UMAP misbehaves
        # on tiny inputs, hence the clamp.
        reducer = umap.UMAP(
            n_neighbors=min(5, n - 1),
            min_dist=0.1,
            metric="cosine",
        )
        reduced = reducer.fit_transform(embeddings)
    else:
        # Too few points for a meaningful UMAP embedding (spectral init can
        # fail outright) — cluster the raw embeddings instead.
        reduced = embeddings

    # Stronger clustering behavior
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=2,
        min_samples=1,
        cluster_selection_epsilon=0.2,
        metric='euclidean',
    )
    labels = clusterer.fit_predict(reduced)

    clusters = {}
    for idx, label in enumerate(labels):
        clusters.setdefault(int(label), []).append(idx)
    return clusters


# ---------------------------------------------------------
# LLM cluster summarizer → always returns valid JSON
# ---------------------------------------------------------
def summarize_cluster_with_llm(raw_items):
    """Ask the LLM for a ``{"title", "summary"}`` JSON object for one cluster.

    Args:
        raw_items (list[str]): the original note strings in the cluster.

    Returns:
        dict: parsed JSON with ``title`` and ``summary`` keys.

    Raises:
        json.JSONDecodeError: if the model output cannot be parsed even
            after stripping accidental markdown fences.
    """
    prompt = f"""
You must return ONLY valid JSON. No markdown. No backticks. No explanations.

Notes:
{raw_items}

Return JSON exactly like:
{{
"title": "...",
"summary": "..."
}}
"""

    response = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
    )

    json_text = response.choices[0].message.content.strip()

    # Try direct JSON parse
    try:
        return json.loads(json_text)
    except json.JSONDecodeError:
        # Fallback: the model sometimes wraps output in ```json ... ``` fences.
        # Strip only the fence decoration — a blanket replace("json", "")
        # would corrupt any summary text containing the word "json".
        cleaned = json_text.strip().strip("`").strip()
        if cleaned.lower().startswith("json"):
            cleaned = cleaned[4:].strip()
        return json.loads(cleaned)


# ---------------------------------------------------------
# MAIN TOOL — Works with text OR file path OR upload
# ---------------------------------------------------------
def cluster_notes_dynamic(input_data) -> dict:
    """
    Dynamically clusters unstructured notes into semantic groups using
    embeddings and HDBSCAN, then summarizes each cluster with an LLM.

    Args:
        input_data (str or file-like):
            - Raw multiline text containing notes, OR
            - A file path to a text file, OR
            - A file-like object uploaded in environments such as Colab or Gradio.

    Returns:
        dict: A JSON-like dictionary structure:
            {
                "clusters": [
                    {
                        "id": <int cluster label, -1 = noise>,
                        "items": [list of notes],
                        "analysis": {
                            "title": "...",
                            "summary": "..."
                        }
                    },
                    ...
                ]
            }

    Behavior:
        - Automatically detects if input is text or file path.
        - Expands each note for better semantic embedding separation.
        - Uses SentenceTransformer embeddings + HDBSCAN for density-based clustering.
        - Uses an LLM to generate a clean title and summary for each cluster.
        - Returns strictly structured output for downstream formatting tools.
    """
    text = load_input(input_data)

    # Parse text into notes: one note per non-blank line.
    raw_notes = [l.strip() for l in text.split("\n") if l.strip()]
    if not raw_notes:
        return {"clusters": []}

    expanded_notes = [expand_note(n) for n in raw_notes]
    cluster_map = cluster_embeddings(expanded_notes)

    results = []
    for cid, idx_list in cluster_map.items():
        items = [raw_notes[i] for i in idx_list]
        analysis = summarize_cluster_with_llm(items)
        results.append({
            "id": cid,
            "items": items,
            "analysis": analysis,
        })

    return {"clusters": results}


def convert_structure_to_markdown(structured_json: dict | str) -> str:
    """
    Converts a structured notes JSON object into a clean, readable Markdown document.

    Args:
        structured_json (dict | str):
            Either a Python dictionary or a JSON string containing clustered
            notes in the format produced by `cluster_notes_dynamic`.

            Example structure:
            {
                "clusters": [
                    {
                        "id": 0,
                        "items": [...],
                        "analysis": {"title": "...", "summary": "..."}
                    }
                ]
            }

    Returns:
        str: A Markdown-formatted representation of all clusters, including
        titles, summaries, and individual note items.
    """
    # Convert string input into dict.  Accept strict JSON first; fall back
    # to ast.literal_eval for Python-repr dicts (single quotes etc.).
    if isinstance(structured_json, str):
        try:
            structured_json = json.loads(structured_json)
        except json.JSONDecodeError:
            structured_json = ast.literal_eval(structured_json)

    parts = ["# 🗂 Structured Notes\n\n"]
    for cluster in structured_json["clusters"]:
        title = cluster["analysis"]["title"]
        summary = cluster["analysis"]["summary"]
        items = cluster["items"]

        parts.append(f"## {title}\n")
        parts.append(f"{summary}\n\n")
        parts.append("### Notes:\n")
        for item in items:
            parts.append(f"- {item}\n")
        parts.append("\n")

    return "".join(parts)


def generate_minimal_google_font_html(md_text: str, font: str = "Inter") -> str:
    """
    Converts Markdown text into a simple, styled HTML document using a Google Font.

    Args:
        md_text (str):
            The Markdown-formatted text to convert into HTML.

        font (str, optional):
            The Google Font to apply to the exported HTML.
            Defaults to "Inter". If empty or None, "Inter" is used automatically.

    Returns:
        str: The file path of the generated HTML file, which can be returned
        directly to Gradio for download.
    """
    # If no font was provided
    if not font or font.strip() == "":
        font = "Inter"

    # Convert markdown to HTML
    html_body = markdown.markdown(md_text)

    # Google Font URL (spaces replaced with +)
    font_url = font.replace(" ", "+")

    # Build final HTML.
    # NOTE(review): the original template did not reference font_url at all
    # and contained no HTML tags (apparently lost in transit); reconstructed
    # here to match the documented behavior — confirm against the intended
    # design.
    final_html = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Notes Export</title>
<link href="https://fonts.googleapis.com/css2?family={font_url}&display=swap" rel="stylesheet">
<style>
  body {{
    font-family: '{font}', sans-serif;
    max-width: 800px;
    margin: 40px auto;
    padding: 0 20px;
    line-height: 1.6;
  }}
</style>
</head>
<body>
{html_body}
</body>
</html>""".strip()

    # Save file
    output_dir = "outputs"
    Path(output_dir).mkdir(exist_ok=True)

    # File path for HF Spaces
    output_path = f"{output_dir}/notes_export.html"
    Path(output_path).write_text(final_html, encoding="utf-8")

    return output_path


notes_interface = gr.Interface(
    fn=cluster_notes_dynamic,
    inputs=gr.Textbox(
        label="Enter notes (one per line)",
        placeholder="Need to call my brother\nSend email\nResearch project",
        lines=5,
    ),
    outputs=gr.Textbox(label="Clustered Output", lines=20),
    api_name="cluster_notes_dynamic",
)

markdown_interface = gr.Interface(
    fn=convert_structure_to_markdown,
    inputs=gr.Textbox(label="Clustered input"),
    outputs=gr.Textbox(label="Markdown output", lines=20),
    api_name="convert_structure_to_markdown",
)

html_interface = gr.Interface(
    fn=generate_minimal_google_font_html,
    inputs=[
        gr.Textbox(label="Markdown Input", lines=12, placeholder="# Your Markdown here..."),
        gr.Textbox(label="Font (optional)", placeholder="Inter (default)"),
    ],
    outputs=gr.File(label="Download HTML"),
    title="Markdown → Styled HTML Converter",
    description="Converts markdown into a clean HTML file styled with Google Fonts.",
)

with gr.Blocks(title="NeatNote: A smart note-clustering MCP server") as demo:
    gr.Markdown("""
## 🗂 NeatNote transforms unstructured text into clear, organized insights using semantic embeddings and LLM summaries.
""")
    gr.TabbedInterface(
        [
            notes_interface,
            markdown_interface,
            html_interface
            # Add more tools here
        ],
        [
            "notes_interface",
            "markdown_interface",
            "html_interface"
            # Add more tool tabs here
        ],
    )
    gr.Markdown("""
# 🚀 Workflow: From Raw Notes → Clusters → Markdown → Beautiful HTML
1. **Cluster Notes:** Paste your raw notes and let AI organize them.
2. **Convert to Markdown:** Turn the structured clusters into clean Markdown.
3. **Export as HTML:** Convert Markdown into a polished, shareable HTML file.
""")
    gr.Markdown("""
# 🎥 [Watch the Demo on YouTube](https://youtu.be/6d8Drbmn0-s?si=fk716CYMBnIpLKME)
""")
    gr.Markdown("""
# **Example prompt to use on claude**:
Convert the attached file messy notes into beautiful structured notes in Inconsolata font html file
""")

if __name__ == "__main__":
    demo.launch(mcp_server=True)