# server_docs.py
from __future__ import annotations
from pathlib import Path
from typing import List, Dict, Any
from mcp.server.fastmcp import FastMCP
import sys
import os
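# Make the project root (two levels up) importable so the shared
# document_intelligence module can be found when this server runs directly.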
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
from document_intelligence import DocumentIntelligence
# Import PDF processing library
try:
import PyPDF2
PDF_SUPPORT = True
except ImportError:
PDF_SUPPORT = False
    print("Warning: PyPDF2 not installed. PDF support disabled.", file=sys.stderr)  # keep stdout free for the stdio transport
# Name your server – this is what clients see
mcp = FastMCP("DocsNavigator")
DOCS_ROOT = Path(__file__).parent.parent.parent / "docs"
doc_intel = DocumentIntelligence(DOCS_ROOT)
def _iter_docs() -> list[Path]:
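    """Collect all documentation files under DOCS_ROOT with supported extensions."""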
exts = {".md", ".txt", ".rst"}
if PDF_SUPPORT:
exts.add(".pdf")
return [
p for p in DOCS_ROOT.rglob("*")
if p.is_file() and p.suffix.lower() in exts
]
def _read_file(path: Path) -> str:
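    """Read a document as text, delegating PDF files to the PDF extractor."""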
if path.suffix.lower() == ".pdf":
return _read_pdf_file(path)
else:
return path.read_text(encoding="utf-8", errors="ignore")
def _read_pdf_file(path: Path) -> str:
"""Extract text from PDF file."""
if not PDF_SUPPORT:
return f"PDF support not available. Install PyPDF2 to read {path.name}"
try:
text = ""
with open(path, 'rb') as file:
pdf_reader = PyPDF2.PdfReader(file)
for page_num, page in enumerate(pdf_reader.pages):
try:
page_text = page.extract_text()
if page_text:
text += f"\n--- Page {page_num + 1} ---\n{page_text}\n"
except Exception as e:
text += f"\n--- Page {page_num + 1} (Error reading: {str(e)}) ---\n"
return text if text.strip() else f"No text could be extracted from {path.name}"
except Exception as e:
return f"Error reading PDF {path.name}: {str(e)}"
def _extract_hierarchical_sections(content: str) -> List[Dict[str, str]]:
"""Extract sections including their subsections for better content access."""
lines = content.split('\n')
headers = []
# Identify all headers
for i, line in enumerate(lines):
stripped = line.strip()
if stripped.startswith('#'):
level = len(stripped) - len(stripped.lstrip('#'))
title = stripped.lstrip('#').strip()
headers.append({
'title': stripped,
'clean_title': title,
'level': level,
'line_index': i
})
if not headers:
return [{'title': 'Document Content', 'content': content.strip()}]
hierarchical_sections = []
# Extract content for each header including subsections
for i, header in enumerate(headers):
start_line = header['line_index']
# Find content that belongs to this section (including subsections)
end_line = len(lines)
for j in range(i + 1, len(headers)):
next_header = headers[j]
# Only stop at headers of the same or higher level (lower number)
if next_header['level'] <= header['level']:
end_line = next_header['line_index']
break
# Extract all content for this section (header + content + subsections)
section_lines = lines[start_line:end_line]
section_content = '\n'.join(section_lines).strip()
# Remove the header line itself from content for cleaner output
if section_content.startswith('#'):
content_lines = section_content.split('\n')[1:]
clean_content = '\n'.join(content_lines).strip()
else:
clean_content = section_content
hierarchical_sections.append({
'title': header['title'],
'content': clean_content,
'level': header['level'],
'includes_subsections': any(h['level'] > header['level'] for h in headers[i+1:] if h['line_index'] < end_line)
})
return hierarchical_sections
def _extract_sections(content: str) -> List[Dict[str, str]]:
"""Extract sections from markdown content based on headers with proper hierarchy."""
lines = content.split('\n')
headers = []
# First pass: identify all headers with their positions
for i, line in enumerate(lines):
stripped = line.strip()
if stripped.startswith('#'):
level = len(stripped) - len(stripped.lstrip('#'))
title = stripped.lstrip('#').strip()
headers.append({
'title': stripped,
'clean_title': title,
'level': level,
'line_index': i
})
if not headers:
return [{'title': 'Document Content', 'content': content.strip()}]
sections = []
# Second pass: extract content for each header
for i, header in enumerate(headers):
start_line = header['line_index'] + 1
# Find the end of this section (next header of same or higher level)
end_line = len(lines)
for j in range(i + 1, len(headers)):
next_header = headers[j]
if next_header['level'] <= header['level']:
end_line = next_header['line_index']
break
# Extract content for this section
section_lines = lines[start_line:end_line]
section_content = '\n'.join(section_lines).strip()
sections.append({
'title': header['title'],
'content': section_content,
'level': header['level']
})
return sections
def _extract_headers(content: str) -> List[Dict[str, Any]]:
"""Extract header hierarchy from markdown content."""
headers = []
lines = content.split('\n')
for line_num, line in enumerate(lines, 1):
stripped = line.strip()
if stripped.startswith('#'):
level = len(stripped) - len(stripped.lstrip('#'))
title = stripped.lstrip('#').strip()
headers.append({
'level': level,
'title': title,
'line': line_num
})
return headers
def _create_outline(headers: List[Dict[str, Any]]) -> List[str]:
"""Create a hierarchical outline from headers."""
outline = []
for header in headers:
indent = " " * (header['level'] - 1)
outline.append(f"{indent}- {header['title']}")
return outline
def _count_code_blocks(content: str) -> int:
"""Count code blocks in markdown content."""
    # Each fenced block has an opening and a closing ``` marker, so divide by two.
    return content.count('```') // 2
def _extract_links(content: str) -> List[str]:
"""Extract links from markdown content."""
import re
# Match markdown links [text](url) and bare URLs
link_pattern = r'\[([^\]]+)\]\(([^)]+)\)|https?://[^\s\])]+'
    links = []
    # findall would return group tuples for this pattern, so iterate over match objects instead.
    for match in re.finditer(link_pattern, content):
        if match.group(2):
            links.append(match.group(2))  # URL from [text](url)
        else:
            links.append(match.group(0))  # Bare URL
    return links
def _generate_overview_summary(content: str, sections: List[Dict[str, str]]) -> str:
"""Generate a concise overview summary."""
if not sections:
# If no sections, summarize the whole content
words = content.split()[:100] # First 100 words
return ' '.join(words) + "..." if len(content.split()) > 100 else ' '.join(words)
summary_parts = []
# Process all meaningful sections (skip empty ones)
for section in sections:
title = section['title'].lstrip('#').strip()
section_content = section['content'].strip()
# Skip empty sections
if not section_content:
continue
# For overview, take first 50 words of each section
content_words = section_content.split()[:50]
section_summary = ' '.join(content_words)
if len(section['content'].split()) > 50:
section_summary += "..."
summary_parts.append(f"**{title}**: {section_summary}")
# Limit to 5 sections for overview to avoid too much text
if len(summary_parts) >= 5:
break
# If we still have no content, fall back to first 100 words
if not summary_parts:
words = content.split()[:100]
return ' '.join(words) + "..." if len(content.split()) > 100 else ' '.join(words)
return '\n\n'.join(summary_parts)
def _extract_key_points(content: str, sections: List[Dict[str, str]]) -> str:
"""Extract key points from content."""
key_points = []
# Look for bullet points and numbered lists in sections
for section in sections:
section_content = section['content']
lines = section_content.split('\n')
for line in lines:
stripped = line.strip()
if (stripped.startswith('- ') or
stripped.startswith('* ') or
stripped.startswith('+ ') or
(stripped and len(stripped) > 0 and stripped[0].isdigit() and '. ' in stripped)):
# Clean up the bullet point
clean_point = stripped.lstrip('- *+0123456789. ').strip()
if clean_point:
key_points.append(f"• {clean_point}")
if key_points:
return '\n'.join(key_points[:15]) # Top 15 points
# Fallback: extract sentences that contain key indicators from all content
sentences = content.replace('\n', ' ').split('.')
important_sentences = []
keywords = ['important', 'note', 'warning', 'key', 'must', 'should', 'required', 'avoid', 'best', 'practice']
for sentence in sentences:
sentence = sentence.strip()
if sentence and any(keyword in sentence.lower() for keyword in keywords):
important_sentences.append(f"• {sentence}.")
return '\n'.join(important_sentences[:8]) if important_sentences else "No specific key points identified."
def _generate_detailed_summary(content: str, sections: List[Dict[str, str]]) -> str:
"""Generate a detailed summary with all sections."""
if not sections:
return content[:1500] + "..." if len(content) > 1500 else content
detailed_parts = []
for section in sections:
title = section['title'].lstrip('#').strip()
section_content = section['content'].strip()
# Skip empty sections
if not section_content:
continue
# For detailed summary, include more content
content_preview = section_content[:400]
if len(section_content) > 400:
content_preview += "..."
detailed_parts.append(f"## {title}\n{content_preview}")
# If no sections with content, return truncated full content
if not detailed_parts:
return content[:1500] + "..." if len(content) > 1500 else content
return '\n\n'.join(detailed_parts)
def _extract_technical_details(content: str, sections: List[Dict[str, str]]) -> str:
"""Extract technical details like code, configurations, and specifications."""
technical_parts = []
# Extract code blocks
import re
code_blocks = re.findall(r'```[\s\S]*?```', content)
if code_blocks:
technical_parts.append("**Code Examples:**")
        for i, block in enumerate(code_blocks[:3], 1):
            preview = block[:100] + "..." if len(block) > 100 else block
            technical_parts.append(f"Block {i}: {preview}")
# Extract technical terms (words in backticks)
tech_terms = re.findall(r'`([^`]+)`', content)
if tech_terms:
unique_terms = list(set(tech_terms))[:10]
technical_parts.append(f"**Technical Terms:** {', '.join(unique_terms)}")
# Look for configuration or specification patterns
config_lines = []
lines = content.split('\n')
for line in lines:
        if ('config' in line.lower() or
                'setting' in line.lower() or
                (('=' in line or ':' in line) and not line.strip().startswith('#'))):
            config_lines.append(line.strip())
if config_lines:
technical_parts.append("**Configurations/Settings:**")
technical_parts.extend(config_lines[:5])
return '\n\n'.join(technical_parts) if technical_parts else "No specific technical details identified."
def _generate_brief_summary(content: str) -> str:
"""Generate a very brief summary (1-2 sentences)."""
words = content.split()
if len(words) <= 30:
return content
# Take first sentence or first 30 words
sentences = content.split('.')
first_sentence = sentences[0].strip() + '.' if sentences else ''
if len(first_sentence.split()) <= 30:
return first_sentence
else:
return ' '.join(words[:30]) + "..."
@mcp.resource("docs://list")
def list_docs_resource() -> list[str]:
"""
Resource that returns a simple list of available doc paths.
"""
return [str(p.relative_to(DOCS_ROOT)) for p in _iter_docs()]
@mcp.resource("docs://{relative_path}")
def read_doc(relative_path: str) -> str:
"""
Read a specific doc by relative path (e.g. 'getting-started.md').
"""
path = (DOCS_ROOT / relative_path).resolve()
if not path.exists() or not path.is_file():
return f"Document not found: {relative_path}"
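    # Reject resolved paths that fall outside DOCS_ROOT (e.g. via '..' segments).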
if DOCS_ROOT not in path.parents and DOCS_ROOT != path.parent:
return "Access denied: path escapes docs root."
return _read_file(path)
@mcp.tool()
def list_docs() -> List[str]:
"""
List available documentation files relative to the docs/ folder.
"""
return [str(p.relative_to(DOCS_ROOT)) for p in _iter_docs()]
@mcp.tool()
def search_docs(query: str, max_results: int = 10) -> List[Dict[str, str]]:
"""
Improved full-text search over docs with better matching.
Args:
query: Search query string.
max_results: Max number of matches to return.
Returns:
List of {path, snippet} matches.
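    Example call (query text is illustrative):
        search_docs("environment variables", max_results=5)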
"""
import re
query_lower = query.lower()
query_words = query_lower.split()
results: list[dict[str, str]] = []
for path in _iter_docs():
text = _read_file(path)
text_lower = text.lower()
# Score based on how many query words are found
matches = []
# First, try exact phrase match (highest score)
if query_lower in text_lower:
idx = text_lower.find(query_lower)
start = max(0, idx - 80)
end = min(len(text), idx + 80)
snippet = text[start:end].replace("\n", " ")
matches.append({
"score": 100,
"snippet": snippet,
"match_type": "exact_phrase"
})
# Then try to find sentences containing most query words
sentences = re.split(r'[.!?]+|\n\n+', text)
for sentence in sentences:
sentence_lower = sentence.lower()
word_matches = sum(1 for word in query_words if word in sentence_lower)
if word_matches >= max(1, len(query_words) * 0.6): # At least 60% of words
# Calculate score based on word matches and total words
score = (word_matches / len(query_words)) * 80
if len(sentence.strip()) > 20: # Prefer longer, more informative sentences
snippet = sentence.strip()[:160] + "..." if len(sentence.strip()) > 160 else sentence.strip()
matches.append({
"score": score,
"snippet": snippet,
"match_type": f"words_{word_matches}/{len(query_words)}"
})
        # Record the best-scoring match for this document
        if matches:
            best_match = max(matches, key=lambda x: x["score"])
results.append({
"path": str(path.relative_to(DOCS_ROOT)),
"snippet": best_match["snippet"],
"score": str(best_match["score"]),
"match_type": best_match["match_type"]
})
# Sort results by score (highest first) and limit
    # Scores were stored as strings above, so compare them numerically when sorting.
    results.sort(key=lambda x: float(x["score"]), reverse=True)
return results[:max_results]
@mcp.tool()
def extract_section(relative_path: str, section_title: str, include_subsections: bool = True) -> Dict[str, Any]:
"""
Extract a specific section from a document.
Args:
relative_path: Path to the document relative to docs/ folder
section_title: Title of the section to extract (case-insensitive, partial matches allowed)
include_subsections: Whether to include subsections in the extracted content
Returns:
Dictionary with section content and metadata
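    Example call (path and section name are illustrative):
        extract_section("getting-started.md", "Installation")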
"""
path = (DOCS_ROOT / relative_path).resolve()
if not path.exists() or not path.is_file():
return {"error": f"Document not found: {relative_path}"}
if DOCS_ROOT not in path.parents and DOCS_ROOT != path.parent:
return {"error": "Access denied: path escapes docs root."}
content = _read_file(path)
# Use hierarchical extraction if including subsections, otherwise flat extraction
if include_subsections:
sections = _extract_hierarchical_sections(content)
else:
sections = _extract_sections(content)
# Find matching section (case-insensitive, partial match)
section_title_lower = section_title.lower()
matching_sections = []
for section in sections:
section_title_clean = section['title'].lstrip('#').strip().lower()
if section_title_lower in section_title_clean or section_title_clean in section_title_lower:
matching_sections.append(section)
if not matching_sections:
# List available sections for user reference
available_sections = [s['title'].lstrip('#').strip() for s in sections if s['content'].strip()]
return {
"error": f"Section '{section_title}' not found",
"available_sections": available_sections[:10], # Limit to first 10 for readability
"total_sections": str(len(available_sections))
}
if len(matching_sections) == 1:
section = matching_sections[0]
result = {
"document": relative_path,
"section_title": section['title'].lstrip('#').strip(),
"content": section['content'].strip(),
"word_count": str(len(section['content'].split())),
"match_type": "single",
"extraction_mode": "hierarchical" if include_subsections else "flat"
}
# Add metadata about subsections if available
if 'includes_subsections' in section:
result["includes_subsections"] = section['includes_subsections']
if 'level' in section:
result["header_level"] = section['level']
return result
else:
# Multiple matches - return all
results = []
for section in matching_sections:
section_info = {
"section_title": section['title'].lstrip('#').strip(),
"content": section['content'].strip(),
"word_count": str(len(section['content'].split()))
}
if 'level' in section:
section_info["header_level"] = section['level']
if 'includes_subsections' in section:
section_info["includes_subsections"] = section['includes_subsections']
results.append(section_info)
return {
"document": relative_path,
"match_type": "multiple",
"matching_sections": results,
"total_matches": str(len(results)),
"extraction_mode": "hierarchical" if include_subsections else "flat"
}
@mcp.tool()
def summarize_document(relative_path: str, summary_type: str = "overview") -> Dict[str, str]:
"""
Generate a smart summary of a specific document.
Args:
relative_path: Path to the document relative to docs/ folder
summary_type: Type of summary - 'overview', 'key_points', 'detailed', or 'technical'
Returns:
Dictionary with document info and structured summary
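    Example call (path is illustrative):
        summarize_document("getting-started.md", summary_type="key_points")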
"""
path = (DOCS_ROOT / relative_path).resolve()
if not path.exists() or not path.is_file():
return {"error": f"Document not found: {relative_path}"}
if DOCS_ROOT not in path.parents and DOCS_ROOT != path.parent:
return {"error": "Access denied: path escapes docs root."}
content = _read_file(path)
word_count = len(content.split())
# Extract key sections based on markdown headers
sections = _extract_sections(content)
# Generate summary based on type
if summary_type == "key_points":
summary = _extract_key_points(content, sections)
elif summary_type == "detailed":
summary = _generate_detailed_summary(content, sections)
elif summary_type == "technical":
summary = _extract_technical_details(content, sections)
else: # overview
summary = _generate_overview_summary(content, sections)
return {
"document": relative_path,
"word_count": str(word_count),
"sections": str(len(sections)),
"summary_type": summary_type,
"summary": summary
}
@mcp.tool()
def analyze_document_structure(relative_path: str) -> Dict[str, Any]:
"""
Analyze the structure and metadata of a document.
Args:
relative_path: Path to the document relative to docs/ folder
Returns:
Dictionary with structural analysis
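    Example call (path is illustrative):
        analyze_document_structure("getting-started.md")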
"""
path = (DOCS_ROOT / relative_path).resolve()
if not path.exists() or not path.is_file():
return {"error": f"Document not found: {relative_path}"}
content = _read_file(path)
# Extract headers and create outline
headers = _extract_headers(content)
sections = _extract_sections(content)
# Basic statistics
lines = content.split('\n')
words = content.split()
# Find code blocks and links
code_blocks = _count_code_blocks(content)
links = _extract_links(content)
return {
"document": relative_path,
"statistics": {
"lines": len(lines),
"words": len(words),
"characters": len(content),
"sections": str(len(sections)),
"code_blocks": code_blocks,
"links": len(links)
},
"structure": {
"headers": headers,
"outline": _create_outline(headers)
        },
        "content_analysis": {
            "has_tables": "| " in content,
            "has_images": "![" in content,
            "has_code": "```" in content or "\n    " in content,
            "external_links": [link for link in links if link.startswith("http")]
}
}
@mcp.tool()
def generate_doc_overview() -> Dict[str, Any]:
"""
Generate a comprehensive overview of the entire documentation set.
Returns:
Dictionary with overall documentation analysis
"""
docs = _iter_docs()
overview = {
"total_documents": str(len(docs)),
"documents_by_type": {},
"total_content": {"words": 0, "lines": 0, "characters": 0},
"structure_analysis": {"sections": 0, "code_blocks": 0},
"document_summaries": []
}
for path in docs:
content = _read_file(path)
ext = path.suffix.lower()
rel_path = str(path.relative_to(DOCS_ROOT))
# Count by type
overview["documents_by_type"][ext] = overview["documents_by_type"].get(ext, 0) + 1
# Aggregate statistics
words = len(content.split())
lines = len(content.split('\n'))
chars = len(content)
overview["total_content"]["words"] += words
overview["total_content"]["lines"] += lines
overview["total_content"]["characters"] += chars
# Structure analysis
sections = len(_extract_sections(content))
code_blocks = _count_code_blocks(content)
overview["structure_analysis"]["sections"] += sections
overview["structure_analysis"]["code_blocks"] += code_blocks
# Brief summary for each doc
brief_summary = _generate_brief_summary(content)
overview["document_summaries"].append({
"path": rel_path,
"words": words,
"sections": sections,
"brief_summary": brief_summary
})
return overview
@mcp.tool()
def semantic_search(query: str, max_results: int = 5) -> List[Dict[str, Any]]:
"""
Perform semantic search across documents using keyword matching and relevance scoring.
Args:
query: Search query
max_results: Maximum number of results to return
Returns:
List of documents with relevance scores and context
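    Example call (query is illustrative):
        semantic_search("authentication setup", max_results=3)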
"""
query_words = set(query.lower().split())
results = []
for path in _iter_docs():
content = _read_file(path)
content_lower = content.lower()
# Calculate relevance score
score = 0
context_snippets = []
for word in query_words:
word_count = content_lower.count(word)
score += word_count * len(word) # Longer words get higher weight
# Find context for each query word
word_positions = []
start = 0
while True:
pos = content_lower.find(word, start)
if pos == -1:
break
word_positions.append(pos)
start = pos + 1
# Get context snippets around found words
for pos in word_positions[:2]: # Max 2 snippets per word
snippet_start = max(0, pos - 60)
snippet_end = min(len(content), pos + 60)
snippet = content[snippet_start:snippet_end].replace('\n', ' ')
context_snippets.append(snippet)
if score > 0:
# Normalize score by document length
normalized_score = score / len(content.split())
results.append({
'path': str(path.relative_to(DOCS_ROOT)),
'relevance_score': normalized_score,
'context_snippets': context_snippets[:3], # Max 3 snippets
'word_count': len(content.split())
})
# Sort by relevance score
results.sort(key=lambda x: x['relevance_score'], reverse=True)
return results[:max_results]
@mcp.tool()
def compare_documents(doc1_path: str, doc2_path: str) -> Dict[str, Any]:
"""
Compare two documents and identify similarities and differences.
Args:
doc1_path: Path to first document
doc2_path: Path to second document
Returns:
Comparison analysis
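    Example call (paths are illustrative):
        compare_documents("getting-started.md", "advanced-usage.md")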
"""
path1 = (DOCS_ROOT / doc1_path).resolve()
path2 = (DOCS_ROOT / doc2_path).resolve()
if not path1.exists() or not path2.exists():
return {"error": "One or both documents not found"}
content1 = _read_file(path1)
content2 = _read_file(path2)
# Basic statistics comparison
stats1 = {
"words": len(content1.split()),
"lines": len(content1.split('\n')),
"characters": len(content1)
}
stats2 = {
"words": len(content2.split()),
"lines": len(content2.split('\n')),
"characters": len(content2)
}
# Find common and unique words
words1 = set(word.lower().strip('.,!?;:') for word in content1.split())
words2 = set(word.lower().strip('.,!?;:') for word in content2.split())
common_words = words1.intersection(words2)
unique_to_doc1 = words1 - words2
unique_to_doc2 = words2 - words1
# Extract headers for structure comparison
headers1 = [h['title'] for h in _extract_headers(content1)]
headers2 = [h['title'] for h in _extract_headers(content2)]
return {
"document1": doc1_path,
"document2": doc2_path,
"statistics": {
"doc1": stats1,
"doc2": stats2,
"size_ratio": stats1["words"] / stats2["words"] if stats2["words"] > 0 else float('inf')
},
"content_similarity": {
"common_words_count": len(common_words),
"unique_to_doc1_count": len(unique_to_doc1),
"unique_to_doc2_count": len(unique_to_doc2),
"similarity_ratio": len(common_words) / len(words1.union(words2)) if len(words1.union(words2)) > 0 else 0
},
"structure_comparison": {
"doc1_headers": headers1,
"doc2_headers": headers2,
"common_headers": list(set(headers1).intersection(set(headers2))),
"unique_headers_doc1": list(set(headers1) - set(headers2)),
"unique_headers_doc2": list(set(headers2) - set(headers1))
},
"sample_unique_words": {
"doc1": list(unique_to_doc1)[:10],
"doc2": list(unique_to_doc2)[:10]
}
}
@mcp.tool()
def extract_definitions(relative_path: str) -> Dict[str, Any]:
"""
Extract definitions, terms, and explanations from a document.
Args:
relative_path: Path to the document
Returns:
Extracted definitions and terms
"""
path = (DOCS_ROOT / relative_path).resolve()
if not path.exists():
return {"error": f"Document not found: {relative_path}"}
content = _read_file(path)
definitions = []
# Look for definition patterns
import re
# Pattern 1: "Term: Definition" or "Term - Definition"
definition_patterns = [
r'^([A-Z][^:\-\n]+):\s*(.+)$', # Term: Definition
r'^([A-Z][^:\-\n]+)\s*-\s*(.+)$', # Term - Definition
r'\*\*([^*]+)\*\*:\s*([^\n]+)', # **Term**: Definition
r'`([^`]+)`:\s*([^\n]+)' # `Term`: Definition
]
for pattern in definition_patterns:
matches = re.findall(pattern, content, re.MULTILINE)
for match in matches:
term, definition = match
definitions.append({
"term": term.strip(),
"definition": definition.strip(),
"type": "explicit"
})
# Look for glossary sections
sections = _extract_sections(content)
glossary_terms = []
for section in sections:
if any(keyword in section['title'].lower() for keyword in ['glossary', 'definition', 'terminology', 'terms']):
lines = section['content'].split('\n')
for line in lines:
if ':' in line or '-' in line:
parts = line.split(':') if ':' in line else line.split('-')
if len(parts) == 2:
glossary_terms.append({
"term": parts[0].strip(),
"definition": parts[1].strip(),
"type": "glossary"
})
# Extract technical terms (words in backticks)
tech_terms = re.findall(r'`([^`]+)`', content)
tech_terms_unique = list(set(tech_terms))
return {
"document": relative_path,
"definitions": definitions,
"glossary_terms": glossary_terms,
"technical_terms": tech_terms_unique,
"total_definitions": str(len(definitions) + len(glossary_terms)),
"definition_density": (len(definitions) + len(glossary_terms)) / len(content.split()) if content.split() else 0
}
@mcp.tool()
def generate_table_of_contents(relative_path: str | None = None) -> Dict[str, Any]:
"""
Generate a table of contents for a specific document or all documents.
Args:
relative_path: Path to specific document, or None for all documents
Returns:
Table of contents structure
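    Example calls (path is illustrative):
        generate_table_of_contents()                      # all documents
        generate_table_of_contents("getting-started.md")  # single document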
"""
if relative_path:
# Single document TOC
path = (DOCS_ROOT / relative_path).resolve()
if not path.exists():
return {"error": f"Document not found: {relative_path}"}
content = _read_file(path)
headers = _extract_headers(content)
return {
"document": relative_path,
"table_of_contents": _create_outline(headers),
"header_count": len(headers),
"max_depth": max([h['level'] for h in headers]) if headers else 0
}
else:
# All documents TOC
all_toc = {}
for path in _iter_docs():
content = _read_file(path)
headers = _extract_headers(content)
rel_path = str(path.relative_to(DOCS_ROOT))
all_toc[rel_path] = {
"outline": _create_outline(headers),
"header_count": len(headers),
"max_depth": max([h['level'] for h in headers]) if headers else 0
}
return {
"type": "complete_documentation_toc",
"documents": all_toc,
"total_documents": str(len(all_toc))
}
@mcp.tool()
def intelligent_summarize(relative_path: str, summary_type: str = "medium", focus_keywords: str | None = None) -> Dict[str, Any]:
"""
Generate an intelligent summary using advanced text analysis.
Args:
relative_path: Path to the document
summary_type: "short", "medium", or "long"
focus_keywords: Optional comma-separated keywords to focus on
Returns:
Intelligent summary with analysis
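    Example call (path and keywords are illustrative):
        intelligent_summarize("getting-started.md", "short", focus_keywords="install, setup")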
"""
path = (DOCS_ROOT / relative_path).resolve()
if not path.exists():
return {"error": f"Document not found: {relative_path}"}
try:
content = _read_file(path)
# Use document intelligence for smart summary
summary_result = doc_intel.generate_smart_summary(content, summary_type)
# Add key concepts
key_concepts = doc_intel.extract_key_concepts(content)
# Add readability analysis
readability = doc_intel.analyze_readability(content)
# If focus keywords provided, highlight relevant sections
focused_content = None
if focus_keywords:
keywords = [k.strip() for k in focus_keywords.split(',')]
# Find sections that contain the keywords
sections = _extract_sections(content)
relevant_sections = []
for section in sections:
if section['content'].strip() and any(keyword.lower() in section['content'].lower() for keyword in keywords):
relevant_sections.append(section['title'].lstrip('#').strip())
focused_content = relevant_sections
return {
"document": relative_path,
"summary": summary_result,
"key_concepts": key_concepts[:10],
"readability": readability,
"focused_sections": focused_content,
"analysis_method": "advanced_intelligence"
}
except Exception as e:
return {
"error": f"Failed to analyze document: {str(e)}",
"document": relative_path,
"fallback_available": True
}
@mcp.tool()
def extract_qa_pairs(relative_path: str | None = None) -> Dict[str, Any]:
"""
Extract question-answer pairs from documents for FAQ generation.
Args:
relative_path: Specific document path, or None for all documents
Returns:
Extracted Q&A pairs
"""
if relative_path:
path = (DOCS_ROOT / relative_path).resolve()
if not path.exists():
return {"error": f"Document not found: {relative_path}"}
content = _read_file(path)
qa_pairs = doc_intel.extract_questions_and_answers(content)
return {
"document": relative_path,
"qa_pairs": qa_pairs,
"total_pairs": str(len(qa_pairs))
}
else:
# Extract from all documents
all_qa_pairs = {}
total_pairs = 0
for path in _iter_docs():
content = _read_file(path)
qa_pairs = doc_intel.extract_questions_and_answers(content)
if qa_pairs:
rel_path = str(path.relative_to(DOCS_ROOT))
all_qa_pairs[rel_path] = qa_pairs
total_pairs += len(qa_pairs)
return {
"type": "complete_documentation_qa",
"qa_by_document": all_qa_pairs,
"total_pairs": str(total_pairs)
}
@mcp.tool()
def find_related_documents(query: str, max_results: int = 3) -> Dict[str, Any]:
"""
Find documents most related to a query using advanced similarity scoring.
Args:
query: Search query or topic
max_results: Maximum number of related documents to return
Returns:
        Dictionary with the related documents, similarity scores, and analysis method
"""
all_docs = list(_iter_docs())
related = doc_intel.find_related_content(query, all_docs, max_results)
return {
"query": query,
"related_documents": related,
"total_analyzed": len(all_docs),
"method": "tf-idf_similarity"
}
@mcp.tool()
def analyze_document_gaps() -> Dict[str, Any]:
"""
Analyze the documentation set to identify potential gaps or areas needing improvement.
Returns:
Analysis of documentation completeness and suggestions
"""
all_docs = list(_iter_docs())
analysis = {
"total_documents": len(all_docs),
"coverage_analysis": {},
"recommendations": [],
"content_quality": {},
"structure_issues": []
}
# Analyze each document
total_words = 0
short_docs = []
long_docs = []
low_readability_docs = []
missing_sections = []
common_sections = ['introduction', 'overview', 'getting started', 'configuration', 'examples', 'troubleshooting']
section_coverage = {section: 0 for section in common_sections}
for path in all_docs:
content = _read_file(path)
rel_path = str(path.relative_to(DOCS_ROOT))
# Word count analysis
word_count = len(content.split())
total_words += word_count
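        # Heuristic thresholds: flag docs under 100 words as too short, over 3000 as overly long.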
if word_count < 100:
short_docs.append(rel_path)
elif word_count > 3000:
long_docs.append(rel_path)
# Readability analysis
readability = doc_intel.analyze_readability(content)
if readability.get('flesch_score', 50) < 30:
low_readability_docs.append(rel_path)
# Section coverage analysis
headers = [h['title'].lower() for h in _extract_headers(content)]
doc_sections = []
for section in common_sections:
if any(section in header for header in headers):
section_coverage[section] += 1
doc_sections.append(section)
missing = [s for s in common_sections if s not in doc_sections]
if missing:
missing_sections.append({"document": rel_path, "missing": missing})
# Generate recommendations
if short_docs:
analysis["recommendations"].append(f"Consider expanding these short documents: {', '.join(short_docs[:3])}")
if low_readability_docs:
analysis["recommendations"].append(f"Improve readability of: {', '.join(low_readability_docs[:3])}")
# Find least covered sections
least_covered = min(section_coverage.values())
missing_section_types = [section for section, count in section_coverage.items() if count <= least_covered]
if missing_section_types:
analysis["recommendations"].append(f"Consider adding {', '.join(missing_section_types)} sections to more documents")
analysis["coverage_analysis"] = {
"average_words_per_doc": total_words / len(all_docs) if all_docs else 0,
"short_documents": short_docs,
"long_documents": long_docs,
"section_coverage": section_coverage
}
analysis["content_quality"] = {
"low_readability": low_readability_docs,
"missing_common_sections": missing_sections
}
return analysis
@mcp.tool()
def generate_documentation_index() -> Dict[str, Any]:
"""
Generate a comprehensive searchable index of all documentation content.
Returns:
Searchable index with topics, concepts, and cross-references
"""
index = {
"concepts": {}, # concept -> [documents]
"topics": {}, # topic -> documents
"cross_references": {}, # document -> related documents
"metadata": {}
}
all_docs = list(_iter_docs())
# Build concept index
all_concepts = {}
for path in all_docs:
content = _read_file(path)
rel_path = str(path.relative_to(DOCS_ROOT))
# Extract concepts from this document
concepts = doc_intel.extract_key_concepts(content, min_frequency=1)
# Add to global concept index
for concept_info in concepts:
concept = concept_info['concept']
if concept not in all_concepts:
all_concepts[concept] = []
all_concepts[concept].append({
"document": rel_path,
"frequency": concept_info['frequency'],
"type": concept_info['type']
})
# Find cross-references (documents with similar concepts)
related_docs = doc_intel.find_related_content(
' '.join([c['concept'] for c in concepts[:5]]),
all_docs,
max_results=3
)
index["cross_references"][rel_path] = [doc['path'] for doc in related_docs if doc['path'] != rel_path]
# Document metadata
headers = _extract_headers(content)
readability = doc_intel.analyze_readability(content)
index["metadata"][rel_path] = {
"word_count": len(content.split()),
"sections": len(headers),
"readability_score": readability.get('flesch_score', 0),
"main_topics": [c['concept'] for c in concepts[:5]]
}
# Filter concepts that appear in multiple documents (more valuable for index)
index["concepts"] = {
concept: docs for concept, docs in all_concepts.items()
if len(docs) > 1 or any(d['frequency'] > 2 for d in docs)
}
# Create topic clusters
topic_clusters = {}
for concept, docs in index["concepts"].items():
if len(docs) >= 2: # Concept appears in multiple docs
topic_clusters[concept] = [doc['document'] for doc in docs]
index["topics"] = topic_clusters
return {
"index": index,
"statistics": {
"total_concepts": len(index["concepts"]),
"total_topics": len(index["topics"]),
"total_documents": len(all_docs),
"avg_cross_references": sum(len(refs) for refs in index["cross_references"].values()) / len(index["cross_references"]) if index["cross_references"] else 0
}
}
if __name__ == "__main__":
# stdio transport keeps it compatible with the official client pattern
mcp.run(transport="stdio")