# NeatNote / app.py
# (Hugging Face Space header: nharshavardhana — commit b684c99)
# Standard library
import ast
import html
import json
import os
from pathlib import Path

# Third-party
import gradio as gr
import hdbscan
import markdown
import openai
import umap
from openai import OpenAI
from sentence_transformers import SentenceTransformer
# ---------------------------------------------------------
# Load API key
# ---------------------------------------------------------
# NOTE(review): if OPENAI_API_KEY is unset, os.getenv returns None and the
# first API call will fail with an auth error — consider failing fast here.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# ---------------------------------------------------------
# Input loader: handles raw text, file path, or uploaded file
# ---------------------------------------------------------
def load_input(source):
    """Return note text from raw text, a file path, or a file-like object.

    Args:
        source: One of
            - str naming an existing file -> the file's UTF-8 contents,
            - any other str -> returned unchanged as raw note text,
            - object with .read() (e.g. an upload) -> its contents, decoded
              from UTF-8 when read() yields bytes.

    Returns:
        str: The note text.

    Raises:
        ValueError: If ``source`` is none of the supported kinds.
    """
    if isinstance(source, str):
        # A string that names an existing file is treated as a path.
        if os.path.isfile(source):
            with open(source, "r", encoding="utf-8") as f:
                return f.read()
        return source
    if hasattr(source, "read"):
        data = source.read()
        # Uploads opened in binary mode yield bytes; text mode already
        # yields str (the old unconditional .decode crashed on str).
        return data.decode("utf-8") if isinstance(data, bytes) else data
    raise ValueError("Unsupported input type. Pass text, file path, or uploaded file.")
# ---------------------------------------------------------
# Expand notes for better embedding semantic separation
# ---------------------------------------------------------
def expand_note(note: str) -> str:
    """Wrap a raw note in an interpretation prompt so its embedding
    separates more cleanly from unrelated notes."""
    quoted = f"This note says: '{note}'. "
    instructions = (
        "Interpret it as a possible work task, personal task, reminder, idea, or question. "
        "Expand the hidden meaning so semantic embeddings become more distinguishable."
    )
    return quoted + instructions
# ---------------------------------------------------------
# Clustering with UMAP + tuned HDBSCAN
# ---------------------------------------------------------
def cluster_embeddings(expanded_notes):
    """Group expanded notes into clusters of note indices.

    Args:
        expanded_notes: List of note strings (already expanded for embedding).

    Returns:
        dict: Maps cluster label (int, -1 = noise) to a list of indices
        into ``expanded_notes``.
    """
    n = len(expanded_notes)
    # A single note cannot be clustered; report it as noise.
    if n == 1:
        return {-1: [0]}
    # Cache the embedding model across calls — loading it on every
    # request is the dominant cost of this function.
    model = getattr(cluster_embeddings, "_model", None)
    if model is None:
        model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        cluster_embeddings._model = model
    embeddings = model.encode(expanded_notes)
    # Dimensionality reduction for cleaner clusters. UMAP requires
    # n_neighbors < n, so clamp it for small note batches.
    reducer = umap.UMAP(
        n_neighbors=min(5, max(2, n - 1)),
        min_dist=0.1,
        metric="cosine"
    )
    reduced = reducer.fit_transform(embeddings)
    # Density-based clustering tuned to allow small (size-2) clusters.
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=2,
        min_samples=1,
        cluster_selection_epsilon=0.2,
        metric='euclidean'
    )
    labels = clusterer.fit_predict(reduced)
    # Invert labels -> {label: [indices]}.
    clusters = {}
    for idx, label in enumerate(labels):
        clusters.setdefault(int(label), []).append(idx)
    return clusters
# ---------------------------------------------------------
# LLM cluster summarizer → always returns valid JSON
# ---------------------------------------------------------
def summarize_cluster_with_llm(raw_items):
    """Ask the LLM for a title and summary of one cluster's notes.

    Args:
        raw_items: List of raw note strings belonging to one cluster.

    Returns:
        dict: Parsed JSON with "title" and "summary" keys.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON even
            after stripping accidental markdown code fences.
    """
    prompt = f"""
You must return ONLY valid JSON.
No markdown. No backticks. No explanations.
Notes:
{raw_items}
Return JSON exactly like:
{{
"title": "...",
"summary": "..."
}}
"""
    response = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
    )
    json_text = response.choices[0].message.content.strip()
    # Try direct JSON parse first.
    try:
        return json.loads(json_text)
    except json.JSONDecodeError:
        # Fallback: the model sometimes wraps output in a ``` / ```json
        # fence despite instructions. Strip only the fence — the previous
        # blanket .replace("json", "") corrupted any payload that happened
        # to contain the word "json".
        cleaned = json_text.strip()
        if cleaned.startswith("```"):
            cleaned = cleaned.strip("`").strip()
            if cleaned.lower().startswith("json"):
                cleaned = cleaned[4:].strip()
        return json.loads(cleaned)
# ---------------------------------------------------------
# MAIN TOOL — Works with text OR file path OR upload
# ---------------------------------------------------------
def cluster_notes_dynamic(input_data) -> dict:
    """Cluster unstructured notes into semantic groups and summarize each.

    Accepts raw multiline text, a path to a text file, or an uploaded
    file-like object. Each non-blank line is treated as one note; notes
    are expanded for embedding, clustered with HDBSCAN, and each cluster
    is given an LLM-generated title and summary.

    Args:
        input_data (str or file-like): Notes text, file path, or upload.

    Returns:
        dict: {"clusters": [{"id": <cluster_id>,
                             "items": [notes...],
                             "analysis": {"title": "...", "summary": "..."}},
                            ...]}
    """
    text = load_input(input_data)
    # One note per non-blank line.
    raw_notes = [line.strip() for line in text.split("\n") if line.strip()]
    if not raw_notes:
        return {"clusters": []}
    cluster_map = cluster_embeddings([expand_note(note) for note in raw_notes])
    results = []
    for cid, idx_list in cluster_map.items():
        members = [raw_notes[i] for i in idx_list]
        results.append({
            "id": cid,
            "items": members,
            "analysis": summarize_cluster_with_llm(members),
        })
    return {"clusters": results}
def convert_structure_to_markdown(structured_json: dict | str) -> str:
"""
Converts a structured notes JSON object into a clean, readable Markdown document.
Args:
structured_json (dict | str):
Either a Python dictionary or a JSON string containing clustered notes
in the format produced by `cluster_notes_dynamic`.
Example structure:
{
"clusters": [
{
"id": 0,
"items": [...],
"analysis": {"title": "...", "summary": "..."}
}
]
}
Returns:
str: A Markdown-formatted representation of all clusters, including
titles, summaries, and individual note items.
"""
# Convert string input into dict
if isinstance(structured_json, str):
try:
structured_json = json.loads(structured_json)
except:
structured_json = ast.literal_eval(structured_json)
md = "# πŸ—‚ Structured Notes\n\n"
for cluster in structured_json["clusters"]:
title = cluster["analysis"]["title"]
summary = cluster["analysis"]["summary"]
items = cluster["items"]
md += f"## {title}\n"
md += f"{summary}\n\n"
md += "### Notes:\n"
for item in items:
md += f"- {item}\n"
md += "\n"
return md
def generate_minimal_google_font_html(md_text: str, font: str = "Inter") -> str:
    """Convert Markdown into a minimal HTML page styled with a Google Font.

    Args:
        md_text: The Markdown-formatted text to render.
        font: Google Font family name. Empty/None/whitespace falls back
            to "Inter".

    Returns:
        str: Path of the generated HTML file, which can be returned
        directly to Gradio for download.
    """
    # Fall back to the default for empty / whitespace-only input.
    font = (font or "").strip() or "Inter"
    # The font name comes straight from a user-facing textbox: escape it
    # before interpolating into HTML/CSS to prevent markup injection.
    safe_font = html.escape(font, quote=True)
    # Google Fonts URLs replace spaces with '+'.
    font_url = safe_font.replace(" ", "+")
    # Convert markdown to HTML body content.
    html_body = markdown.markdown(md_text)
    # Build final HTML document.
    final_html = f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Notes Export</title>
<link href="https://fonts.googleapis.com/css2?family={font_url}:wght@300;400;600&display=swap" rel="stylesheet">
<style>
body {{
font-family: '{safe_font}', sans-serif;
max-width: 800px;
margin: 40px auto;
padding: 20px;
line-height: 1.6;
color: #222;
}}
h1, h2, h3 {{
font-weight: 600;
}}
ul {{
margin-left: 20px;
}}
</style>
</head>
<body>
{html_body}
</body>
</html>
""".strip()
    # Write into ./outputs so HF Spaces can serve the file for download.
    output_dir = Path("outputs")
    output_dir.mkdir(exist_ok=True)
    output_path = output_dir / "notes_export.html"
    output_path.write_text(final_html, encoding="utf-8")
    return str(output_path)
notes_interface = gr.Interface(
fn=cluster_notes_dynamic,
inputs=gr.Textbox(
label="Enter notes (one per line)",
placeholder="Need to call my brother\nSend email\nResearch project",
lines=5
),
outputs=gr.Textbox(label="Clustered Output", lines=20),
api_name="cluster_notes_dynamic"
)
markdown_interface = gr.Interface(
fn=convert_structure_to_markdown,
inputs=gr.Textbox(label="Clustered input"),
outputs=gr.Textbox(label="Markdown output",lines=20),
api_name="convert_structure_to_markdown"
)
html_interface = gr.Interface(
fn=generate_minimal_google_font_html,
inputs=[
gr.Textbox(label="Markdown Input", lines=12, placeholder="# Your Markdown here..."),
gr.Textbox(label="Font (optional)", placeholder="Inter (default)")
],
outputs=gr.File(label="Download HTML"),
title="Markdown β†’ Styled HTML Converter",
description="Converts markdown into a clean HTML file styled with Google Fonts."
)
# Assemble the tabbed UI; launched (as an MCP server) under __main__.
with gr.Blocks(title="NeatNote: A smart note-clustering MCP server") as demo:
    gr.Markdown("""
## 🗂 NeatNote transforms unstructured text into clear, organized insights using semantic embeddings and LLM summaries.
""")
    # One tab per tool, in workflow order (cluster -> markdown -> html).
    gr.TabbedInterface(
        [
            notes_interface,
            markdown_interface,
            html_interface
            # Add more tools here
        ],
        [
            "notes_interface",
            "markdown_interface",
            "html_interface"
            # Add more tool tabs here
        ]
    )
    gr.Markdown("""
# 🚀 Workflow: From Raw Notes → Clusters → Markdown → Beautiful HTML
1. **Cluster Notes:** Paste your raw notes and let AI organize them.
2. **Convert to Markdown:** Turn the structured clusters into clean Markdown.
3. **Export as HTML:** Convert Markdown into a polished, shareable HTML file.
""")
    gr.Markdown("""
# 🎥 [Watch the Demo on YouTube](https://youtu.be/6d8Drbmn0-s?si=fk716CYMBnIpLKME)
""")
    gr.Markdown("""
# **Example prompt to use on claude**: Convert the attached file messy notes into beautiful structured notes in Inconsolata font html file
""")
if __name__ == "__main__":
    # mcp_server=True exposes each api_name endpoint as an MCP tool.
    demo.launch(mcp_server=True)