Spaces:
Sleeping
Sleeping
| import os | |
| import json | |
| import hdbscan | |
| import openai | |
| from openai import OpenAI | |
| from sentence_transformers import SentenceTransformer | |
| import umap | |
| import ast | |
| import markdown | |
| from pathlib import Path | |
| import gradio as gr | |
# ---------------------------------------------------------
# OpenAI client
# ---------------------------------------------------------
# The API key is taken from the OPENAI_API_KEY environment variable so that
# it never has to be written into this file.
_openai_api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=_openai_api_key)
# ---------------------------------------------------------
# Input loader: handles raw text, file path, or uploaded file
# ---------------------------------------------------------
def load_input(source):
    """Return the text carried by *source*.

    Args:
        source: One of
            - a string naming an existing file, which is read as UTF-8;
            - any other string, which is returned unchanged as raw text;
            - a file-like object exposing ``read()``, opened in either
              binary or text mode.

    Returns:
        str: The text content.

    Raises:
        ValueError: If *source* is none of the supported kinds, or its
            ``read()`` yields something that is neither bytes nor str.
    """
    if isinstance(source, str):
        # NOTE(review): any string that happens to name a readable file is
        # loaded from disk. When this function is fed text typed by remote
        # users, that lets them read server-side files; confirm this is
        # intended before exposing it publicly.
        if os.path.isfile(source):
            with open(source, "r", encoding="utf-8") as handle:
                return handle.read()
        return source

    if hasattr(source, "read"):
        payload = source.read()
        # Binary uploads yield bytes that must be decoded; text-mode handles
        # already yield str, and calling .decode() on that would fail.
        if isinstance(payload, (bytes, bytearray)):
            return bytes(payload).decode("utf-8")
        if isinstance(payload, str):
            return payload

    raise ValueError("Unsupported input type. Pass text, file path, or uploaded file.")
# ---------------------------------------------------------
# Expand notes for better embedding semantic separation
# ---------------------------------------------------------
def expand_note(note: str) -> str:
    """Wrap *note* in a fixed instruction-style sentence frame.

    The note is quoted verbatim in the first sentence; two constant
    sentences follow. The resulting string is what gets embedded instead of
    the bare note.
    """
    sentences = [
        f"This note says: '{note}'.",
        "Interpret it as a possible work task, personal task, reminder, idea, or question.",
        "Expand the hidden meaning so semantic embeddings become more distinguishable.",
    ]
    return " ".join(sentences)
# ---------------------------------------------------------
# Clustering with UMAP + tuned HDBSCAN
# ---------------------------------------------------------
_EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Below this many notes the UMAP step is skipped and everything is returned
# as one group. UMAP requires n_neighbors >= 2, and its default spectral
# initialisation needs more samples than output dimensions.
# NOTE(review): threshold chosen conservatively; confirm against the
# installed umap-learn version.
_MIN_NOTES_FOR_CLUSTERING = 4

_embedding_model = None


def _get_embedding_model():
    """Return the shared SentenceTransformer, loading it on first use only."""
    global _embedding_model
    if _embedding_model is None:
        _embedding_model = SentenceTransformer(_EMBEDDING_MODEL_NAME)
    return _embedding_model


def cluster_embeddings(expanded_notes):
    """Group notes by semantic similarity.

    Args:
        expanded_notes: Sequence of strings to embed and cluster.

    Returns:
        dict[int, list[int]]: Maps each HDBSCAN label to the indices (into
        *expanded_notes*) of the notes carrying that label. Label ``-1`` is
        HDBSCAN's noise label; it is also used for the single group returned
        when there are too few notes to cluster. An empty input yields an
        empty dict.
    """
    n = len(expanded_notes)
    if n == 0:
        return {}
    # Too few notes to build a meaningful neighbour graph: one trivial group.
    if n < _MIN_NOTES_FOR_CLUSTERING:
        return {-1: list(range(n))}

    embeddings = _get_embedding_model().encode(expanded_notes)

    # Dimensionality reduction for cleaner clusters. n_neighbors can never
    # usefully exceed n - 1, so clamp it for small inputs.
    reducer = umap.UMAP(
        n_neighbors=min(5, n - 1),
        min_dist=0.1,
        metric="cosine",
    )
    reduced = reducer.fit_transform(embeddings)

    # Permissive HDBSCAN settings so that even pairs of notes form a cluster.
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=2,
        min_samples=1,
        cluster_selection_epsilon=0.2,
        metric="euclidean",
    )
    labels = clusterer.fit_predict(reduced)

    clusters = {}
    for idx, label in enumerate(labels):
        # int() turns the numpy integer label into a plain, JSON-friendly key.
        clusters.setdefault(int(label), []).append(idx)
    return clusters
# ---------------------------------------------------------
# LLM cluster summarizer - always returns a dict with "title" and "summary"
# ---------------------------------------------------------
def _parse_llm_json(text):
    """Extract a JSON object from an LLM reply; return None if none parses."""
    candidates = [text]
    # Code fences or surrounding chatter: retry with only the outermost
    # {...} span instead of deleting substrings such as "json", which would
    # also corrupt legitimate words inside the title or summary.
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def summarize_cluster_with_llm(raw_items):
    """Ask the LLM for a title and summary describing a group of notes.

    Args:
        raw_items: The notes belonging to one cluster; they are interpolated
            into the prompt as-is.

    Returns:
        dict: The JSON object returned by the model, guaranteed to contain
        the keys ``"title"`` and ``"summary"``. If the reply cannot be
        parsed as a JSON object, a fallback dict is returned whose summary
        holds the raw reply text.
    """
    prompt = f"""
You must return ONLY valid JSON.
No markdown. No backticks. No explanations.
Notes:
{raw_items}
Return JSON exactly like:
{{
"title": "...",
"summary": "..."
}}
"""
    response = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
        # JSON mode: asks the API to emit a single syntactically valid JSON
        # object (the prompt already mentions JSON, as JSON mode requires).
        response_format={"type": "json_object"},
    )
    # content can be None (e.g. on a refusal), so normalise before stripping.
    reply = (response.choices[0].message.content or "").strip()

    parsed = _parse_llm_json(reply)
    if parsed is None:
        # Keep the pipeline alive and surface the raw reply for inspection.
        return {"title": "Untitled", "summary": reply}

    # Downstream formatting reads both keys unconditionally.
    parsed.setdefault("title", "Untitled")
    parsed.setdefault("summary", "")
    return parsed
# ---------------------------------------------------------
# MAIN TOOL - works with text OR file path OR upload
# ---------------------------------------------------------
def cluster_notes_dynamic(input_data) -> dict:
    """
    Split unstructured notes into semantic groups and summarise each group with an LLM.

    Args:
        input_data (str or file-like):
            - Raw multiline text with one note per line, OR
            - A file path to a text file, OR
            - A file-like object (anything exposing ``read()``).

    Returns:
        dict: A JSON-like dictionary structure:
            {
                "clusters": [
                    {
                        "id": <cluster_id>,
                        "items": [list of notes],
                        "analysis": <result of summarize_cluster_with_llm>
                    },
                    ...
                ]
            }
        An input without any non-blank line yields ``{"clusters": []}``.

    Behavior:
        - Resolves the input to text via ``load_input``.
        - Treats every non-blank line, stripped of surrounding whitespace, as one note.
        - Expands each note with ``expand_note`` before clustering.
        - Groups the expanded notes with ``cluster_embeddings``.
        - Summarises the original (unexpanded) notes of each group with
          ``summarize_cluster_with_llm``.
    """
    text = load_input(input_data)

    # One note per non-blank line.
    notes = [line for line in map(str.strip, text.split("\n")) if line]
    if len(notes) == 0:
        return {"clusters": []}

    index_groups = cluster_embeddings([expand_note(note) for note in notes])

    def describe(cluster_id, member_indices):
        # The LLM sees the notes as the user wrote them, not the expanded form.
        members = [notes[i] for i in member_indices]
        return {
            "id": cluster_id,
            "items": members,
            "analysis": summarize_cluster_with_llm(members),
        }

    return {
        "clusters": [
            describe(cluster_id, member_indices)
            for cluster_id, member_indices in index_groups.items()
        ]
    }
| def convert_structure_to_markdown(structured_json: dict | str) -> str: | |
| """ | |
| Converts a structured notes JSON object into a clean, readable Markdown document. | |
| Args: | |
| structured_json (dict | str): | |
| Either a Python dictionary or a JSON string containing clustered notes | |
| in the format produced by `cluster_notes_dynamic`. | |
| Example structure: | |
| { | |
| "clusters": [ | |
| { | |
| "id": 0, | |
| "items": [...], | |
| "analysis": {"title": "...", "summary": "..."} | |
| } | |
| ] | |
| } | |
| Returns: | |
| str: A Markdown-formatted representation of all clusters, including | |
| titles, summaries, and individual note items. | |
| """ | |
| # Convert string input into dict | |
| if isinstance(structured_json, str): | |
| try: | |
| structured_json = json.loads(structured_json) | |
| except: | |
| structured_json = ast.literal_eval(structured_json) | |
| md = "# π Structured Notes\n\n" | |
| for cluster in structured_json["clusters"]: | |
| title = cluster["analysis"]["title"] | |
| summary = cluster["analysis"]["summary"] | |
| items = cluster["items"] | |
| md += f"## {title}\n" | |
| md += f"{summary}\n\n" | |
| md += "### Notes:\n" | |
| for item in items: | |
| md += f"- {item}\n" | |
| md += "\n" | |
| return md | |
def _sanitize_font_name(font):
    """Reduce *font* to ASCII letters, digits, spaces, '-' and '_'; default to "Inter"."""
    kept = "".join(
        ch for ch in (font or "")
        if ch.isascii() and (ch.isalnum() or ch in " -_")
    )
    # Collapse runs of whitespace and trim the ends.
    kept = " ".join(kept.split())
    return kept or "Inter"


def generate_minimal_google_font_html(md_text: str, font: str = "Inter") -> str:
    """
    Converts Markdown text into a simple, styled HTML document using a Google Font.

    Args:
        md_text (str):
            The Markdown-formatted text to convert into HTML.
        font (str, optional):
            The Google Font to apply to the exported HTML.
            Defaults to "Inter". If empty, blank or None, "Inter" is used
            automatically. Characters other than ASCII letters, digits,
            spaces, '-' and '_' are dropped, because the name is embedded in
            a URL, an HTML attribute and a CSS string.

    Returns:
        str:
            The file path of the generated HTML file
            ("outputs/notes_export.html"), which can be returned directly to
            Gradio for download. The file is overwritten on every call.
    """
    # The font name is user input that ends up inside markup and CSS, so it
    # is reduced to a harmless character set before interpolation.
    font = _sanitize_font_name(font)

    # Convert markdown to HTML.
    # NOTE(review): raw HTML embedded in the Markdown is presumably passed
    # through to the export unchanged; confirm that is acceptable.
    html_body = markdown.markdown(md_text or "")

    # Google Font URL (spaces replaced with +)
    font_url = font.replace(" ", "+")

    # Build final HTML
    final_html = f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Notes Export</title>
<link href="https://fonts.googleapis.com/css2?family={font_url}:wght@300;400;600&display=swap" rel="stylesheet">
<style>
body {{
font-family: '{font}', sans-serif;
max-width: 800px;
margin: 40px auto;
padding: 20px;
line-height: 1.6;
color: #222;
}}
h1, h2, h3 {{
font-weight: 600;
}}
ul {{
margin-left: 20px;
}}
</style>
</head>
<body>
{html_body}
</body>
</html>
""".strip()

    # Save file (path is relative to the working directory).
    output_dir = Path("outputs")
    output_dir.mkdir(exist_ok=True)
    output_path = output_dir / "notes_export.html"
    output_path.write_text(final_html, encoding="utf-8")

    # Forward slashes on every platform, matching the previous return value.
    return output_path.as_posix()
# Tab 1: raw notes in, clustered structure out (shown in a text box).
_notes_input = gr.Textbox(
    lines=5,
    label="Enter notes (one per line)",
    placeholder="Need to call my brother\nSend email\nResearch project",
)
_clusters_output = gr.Textbox(lines=20, label="Clustered Output")

notes_interface = gr.Interface(
    api_name="cluster_notes_dynamic",
    fn=cluster_notes_dynamic,
    inputs=_notes_input,
    outputs=_clusters_output,
)
# Tab 2: clustered structure (as text) in, Markdown out.
_structure_input = gr.Textbox(label="Clustered input")
_markdown_output = gr.Textbox(lines=20, label="Markdown output")

markdown_interface = gr.Interface(
    api_name="convert_structure_to_markdown",
    fn=convert_structure_to_markdown,
    inputs=_structure_input,
    outputs=_markdown_output,
)
# Tab 3: Markdown plus an optional font name in, downloadable HTML file out.
_html_inputs = [
    gr.Textbox(lines=12, label="Markdown Input", placeholder="# Your Markdown here..."),
    gr.Textbox(label="Font (optional)", placeholder="Inter (default)"),
]
_html_output = gr.File(label="Download HTML")

html_interface = gr.Interface(
    title="Markdown β Styled HTML Converter",
    description="Converts markdown into a clean HTML file styled with Google Fonts.",
    fn=generate_minimal_google_font_html,
    inputs=_html_inputs,
    outputs=_html_output,
)
# Page layout: intro blurb, one tab per tool, then usage notes.
with gr.Blocks(title="NeatNote: A smart note-clustering MCP server") as demo:
    gr.Markdown("""
## π NeatNote transforms unstructured text into clear, organized insights using semantic embeddings and LLM summaries.
""")

    # (tab label, interface) pairs - add more tools here.
    _tool_tabs = [
        ("notes_interface", notes_interface),
        ("markdown_interface", markdown_interface),
        ("html_interface", html_interface),
    ]
    gr.TabbedInterface(
        [interface for _, interface in _tool_tabs],
        [label for label, _ in _tool_tabs],
    )

    gr.Markdown("""
# π Workflow: From Raw Notes β Clusters β Markdown β Beautiful HTML
1. **Cluster Notes:** Paste your raw notes and let AI organize them.
2. **Convert to Markdown:** Turn the structured clusters into clean Markdown.
3. **Export as HTML:** Convert Markdown into a polished, shareable HTML file.
""")
    gr.Markdown("""
# π₯ [Watch the Demo on YouTube](https://youtu.be/6d8Drbmn0-s?si=fk716CYMBnIpLKME)
""")
    gr.Markdown("""
# **Example prompt to use on claude**: Convert the attached file messy notes into beautiful structured notes in Inconsolata font html file
""")

# Start the app, with the MCP server enabled, only when run as a script.
if __name__ == "__main__":
    demo.launch(mcp_server=True)