# server_docs.py
from __future__ import annotations

import os
import sys
from pathlib import Path
from typing import Any, Dict, List

from mcp.server.fastmcp import FastMCP

sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
from document_intelligence import DocumentIntelligence

# Import PDF processing library
try:
    import PyPDF2
    PDF_SUPPORT = True
except ImportError:
    PDF_SUPPORT = False
    # Warn on stderr: stdout is reserved for the MCP stdio transport.
    print("Warning: PyPDF2 not installed. PDF support disabled.", file=sys.stderr)

# Name your server – this is what clients see
mcp = FastMCP("DocsNavigator")
# Resolve once so the sandbox checks below compare resolved paths consistently.
DOCS_ROOT = (Path(__file__).parent.parent.parent / "docs").resolve()
doc_intel = DocumentIntelligence(DOCS_ROOT)

def _iter_docs() -> list[Path]:
    exts = {".md", ".txt", ".rst"}
    if PDF_SUPPORT:
        exts.add(".pdf")
    return [
        p for p in DOCS_ROOT.rglob("*")
        if p.is_file() and p.suffix.lower() in exts
    ]

def _read_file(path: Path) -> str:
    if path.suffix.lower() == ".pdf":
        return _read_pdf_file(path)
    return path.read_text(encoding="utf-8", errors="ignore")

def _read_pdf_file(path: Path) -> str:
    """Extract text from a PDF file, page by page."""
    if not PDF_SUPPORT:
        return f"PDF support not available. Install PyPDF2 to read {path.name}"
    try:
        text = ""
        with open(path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page_num, page in enumerate(pdf_reader.pages):
                try:
                    page_text = page.extract_text()
                    if page_text:
                        text += f"\n--- Page {page_num + 1} ---\n{page_text}\n"
                except Exception as e:
                    text += f"\n--- Page {page_num + 1} (Error reading: {e}) ---\n"
        return text if text.strip() else f"No text could be extracted from {path.name}"
    except Exception as e:
        return f"Error reading PDF {path.name}: {e}"

def _extract_hierarchical_sections(content: str) -> List[Dict[str, str]]:
    """Extract sections including their subsections for better content access."""
    lines = content.split('\n')
    headers = []
    # Identify all headers
    for i, line in enumerate(lines):
        stripped = line.strip()
        if stripped.startswith('#'):
            level = len(stripped) - len(stripped.lstrip('#'))
            title = stripped.lstrip('#').strip()
            headers.append({
                'title': stripped,
                'clean_title': title,
                'level': level,
                'line_index': i
            })
    if not headers:
        return [{'title': 'Document Content', 'content': content.strip()}]
    hierarchical_sections = []
    # Extract content for each header, including its subsections
    for i, header in enumerate(headers):
        start_line = header['line_index']
        # A section ends at the next header of the same or higher level
        # (a smaller number means a higher level).
        end_line = len(lines)
        for j in range(i + 1, len(headers)):
            next_header = headers[j]
            if next_header['level'] <= header['level']:
                end_line = next_header['line_index']
                break
        # Extract all content for this section (header + content + subsections)
        section_lines = lines[start_line:end_line]
        section_content = '\n'.join(section_lines).strip()
        # Remove the header line itself for cleaner output
        if section_content.startswith('#'):
            content_lines = section_content.split('\n')[1:]
            clean_content = '\n'.join(content_lines).strip()
        else:
            clean_content = section_content
        hierarchical_sections.append({
            'title': header['title'],
            'content': clean_content,
            'level': header['level'],
            'includes_subsections': any(
                h['level'] > header['level']
                for h in headers[i + 1:] if h['line_index'] < end_line
            )
        })
    return hierarchical_sections

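# Illustrative sketch (comment only, not executed): given a small markdown
# string, the hierarchical extractor returns a "Setup" section whose content
# still contains the "## Install" subsection, plus "Install" as its own entry.
#
#   _extract_hierarchical_sections("# Setup\nIntro text\n## Install\npip install x")
#   # -> [{'title': '# Setup', 'content': 'Intro text\n## Install\npip install x',
#   #      'level': 1, 'includes_subsections': True},
#   #     {'title': '## Install', 'content': 'pip install x',
#   #      'level': 2, 'includes_subsections': False}]
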
def _extract_sections(content: str) -> List[Dict[str, str]]:
    """Extract flat sections from markdown content, one per header.

    Unlike _extract_hierarchical_sections, a section's content stops at the
    very next header, so subsection text is not folded into its parent.
    """
    lines = content.split('\n')
    headers = []
    # First pass: identify all headers with their positions
    for i, line in enumerate(lines):
        stripped = line.strip()
        if stripped.startswith('#'):
            level = len(stripped) - len(stripped.lstrip('#'))
            title = stripped.lstrip('#').strip()
            headers.append({
                'title': stripped,
                'clean_title': title,
                'level': level,
                'line_index': i
            })
    if not headers:
        return [{'title': 'Document Content', 'content': content.strip()}]
    sections = []
    # Second pass: extract content for each header
    for i, header in enumerate(headers):
        start_line = header['line_index'] + 1
        # The flat section ends at the next header of any level, which keeps
        # subsections out of the parent and makes include_subsections=False
        # in extract_section meaningful.
        end_line = headers[i + 1]['line_index'] if i + 1 < len(headers) else len(lines)
        # Extract content for this section
        section_lines = lines[start_line:end_line]
        section_content = '\n'.join(section_lines).strip()
        sections.append({
            'title': header['title'],
            'content': section_content,
            'level': header['level']
        })
    return sections

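# Illustrative contrast with the hierarchical extractor above (comment only):
#
#   _extract_sections("# Setup\nIntro text\n## Install\npip install x")
#   # -> [{'title': '# Setup', 'content': 'Intro text', 'level': 1},
#   #     {'title': '## Install', 'content': 'pip install x', 'level': 2}]
#
# "Intro text" is all that belongs directly to "Setup"; the Install subsection
# is returned as its own entry instead of being folded into its parent.
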
def _extract_headers(content: str) -> List[Dict[str, Any]]:
    """Extract the header hierarchy from markdown content."""
    headers = []
    lines = content.split('\n')
    for line_num, line in enumerate(lines, 1):
        stripped = line.strip()
        if stripped.startswith('#'):
            level = len(stripped) - len(stripped.lstrip('#'))
            title = stripped.lstrip('#').strip()
            headers.append({
                'level': level,
                'title': title,
                'line': line_num
            })
    return headers

def _create_outline(headers: List[Dict[str, Any]]) -> List[str]:
    """Create a hierarchical outline from headers."""
    outline = []
    for header in headers:
        indent = "  " * (header['level'] - 1)
        outline.append(f"{indent}- {header['title']}")
    return outline

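# Illustrative sketch (comment only): headers become an indented bullet
# outline, one two-space indent per header level.
#
#   _create_outline([{'level': 1, 'title': 'Setup', 'line': 1},
#                    {'level': 2, 'title': 'Install', 'line': 3}])
#   # -> ['- Setup', '  - Install']
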
def _count_code_blocks(content: str) -> int:
    """Count fenced code blocks in markdown content (two ``` fences per block)."""
    return content.count('```') // 2

def _extract_links(content: str) -> List[str]:
    """Extract links from markdown content."""
    import re
    # Match markdown links [text](url) and bare URLs. With finditer we can
    # tell which alternative matched; re.findall on a grouped pattern would
    # return empty tuples for bare URLs and never capture them.
    link_pattern = r'\[([^\]]+)\]\(([^)]+)\)|(https?://[^\s\])]+)'
    links = []
    for match in re.finditer(link_pattern, content):
        if match.group(2):
            links.append(match.group(2))  # URL from [text](url)
        elif match.group(3):
            links.append(match.group(3))  # bare URL
    return links

def _generate_overview_summary(content: str, sections: List[Dict[str, str]]) -> str:
    """Generate a concise overview summary."""
    if not sections:
        # No sections: summarize the whole content (first 100 words)
        words = content.split()[:100]
        return ' '.join(words) + "..." if len(content.split()) > 100 else ' '.join(words)
    summary_parts = []
    # Process all meaningful sections (skip empty ones)
    for section in sections:
        title = section['title'].lstrip('#').strip()
        section_content = section['content'].strip()
        if not section_content:
            continue
        # For the overview, take the first 50 words of each section
        content_words = section_content.split()[:50]
        section_summary = ' '.join(content_words)
        if len(section['content'].split()) > 50:
            section_summary += "..."
        summary_parts.append(f"**{title}**: {section_summary}")
        # Limit the overview to 5 sections to avoid too much text
        if len(summary_parts) >= 5:
            break
    # If there is still no content, fall back to the first 100 words
    if not summary_parts:
        words = content.split()[:100]
        return ' '.join(words) + "..." if len(content.split()) > 100 else ' '.join(words)
    return '\n\n'.join(summary_parts)

def _extract_key_points(content: str, sections: List[Dict[str, str]]) -> str:
    """Extract key points from content."""
    key_points = []
    # Look for bullet points and numbered lists in sections
    for section in sections:
        for line in section['content'].split('\n'):
            stripped = line.strip()
            if (stripped.startswith(('- ', '* ', '+ '))
                    or (stripped and stripped[0].isdigit() and '. ' in stripped)):
                # Clean up the bullet/number prefix
                clean_point = stripped.lstrip('- *+0123456789. ').strip()
                if clean_point:
                    key_points.append(f"• {clean_point}")
    if key_points:
        return '\n'.join(key_points[:15])  # top 15 points
    # Fallback: extract sentences that contain key indicator words
    sentences = content.replace('\n', ' ').split('.')
    keywords = ['important', 'note', 'warning', 'key', 'must', 'should',
                'required', 'avoid', 'best', 'practice']
    important_sentences = []
    for sentence in sentences:
        sentence = sentence.strip()
        if sentence and any(keyword in sentence.lower() for keyword in keywords):
            important_sentences.append(f"• {sentence}.")
    return '\n'.join(important_sentences[:8]) if important_sentences else "No specific key points identified."

def _generate_detailed_summary(content: str, sections: List[Dict[str, str]]) -> str:
    """Generate a detailed summary covering all sections."""
    if not sections:
        return content[:1500] + "..." if len(content) > 1500 else content
    detailed_parts = []
    for section in sections:
        title = section['title'].lstrip('#').strip()
        section_content = section['content'].strip()
        # Skip empty sections
        if not section_content:
            continue
        # For the detailed summary, include more content per section
        content_preview = section_content[:400]
        if len(section_content) > 400:
            content_preview += "..."
        detailed_parts.append(f"## {title}\n{content_preview}")
    # If no sections had content, return the truncated full content
    if not detailed_parts:
        return content[:1500] + "..." if len(content) > 1500 else content
    return '\n\n'.join(detailed_parts)

def _extract_technical_details(content: str, sections: List[Dict[str, str]]) -> str:
    """Extract technical details like code, configurations, and specifications."""
    import re
    technical_parts = []
    # Extract code blocks
    code_blocks = re.findall(r'```[\s\S]*?```', content)
    if code_blocks:
        technical_parts.append("**Code Examples:**")
        for i, block in enumerate(code_blocks[:3], 1):
            technical_parts.append(f"Block {i}: {block[:100]}..." if len(block) > 100 else block)
    # Extract technical terms (words in backticks)
    tech_terms = re.findall(r'`([^`]+)`', content)
    if tech_terms:
        unique_terms = list(set(tech_terms))[:10]
        technical_parts.append(f"**Technical Terms:** {', '.join(unique_terms)}")
    # Look for configuration or specification patterns, skipping markdown headers
    config_lines = []
    for line in content.split('\n'):
        stripped = line.strip()
        if stripped.startswith('#'):
            continue
        if ('config' in line.lower() or 'setting' in line.lower()
                or '=' in line or ':' in line):
            config_lines.append(stripped)
    if config_lines:
        technical_parts.append("**Configurations/Settings:**")
        technical_parts.extend(config_lines[:5])
    return '\n\n'.join(technical_parts) if technical_parts else "No specific technical details identified."

def _generate_brief_summary(content: str) -> str:
    """Generate a very brief summary (1-2 sentences)."""
    words = content.split()
    if len(words) <= 30:
        return content
    # Take the first sentence, or the first 30 words if it is too long
    sentences = content.split('.')
    first_sentence = sentences[0].strip() + '.' if sentences else ''
    if len(first_sentence.split()) <= 30:
        return first_sentence
    return ' '.join(words[:30]) + "..."

@mcp.resource("docs://list")
def list_docs_resource() -> list[str]:
    """Resource that returns a simple list of available doc paths."""
    return [str(p.relative_to(DOCS_ROOT)) for p in _iter_docs()]


@mcp.resource("docs://{relative_path}")
def read_doc(relative_path: str) -> str:
    """Read a specific doc by relative path (e.g. 'getting-started.md')."""
    path = (DOCS_ROOT / relative_path).resolve()
    # Sandbox check first, so path probing outside docs/ reveals nothing
    if DOCS_ROOT not in path.parents:
        return "Access denied: path escapes docs root."
    if not path.exists() or not path.is_file():
        return f"Document not found: {relative_path}"
    return _read_file(path)

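# Usage sketch (assumed client behavior, not part of this server): an MCP
# client reads these resources by URI, e.g. "docs://list" for the listing and
# "docs://getting-started.md" for one document ("getting-started.md" is a
# hypothetical file name). With the official Python client SDK this is roughly:
#
#   result = await session.read_resource("docs://getting-started.md")
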
@mcp.tool()
def list_docs() -> List[str]:
    """List available documentation files relative to the docs/ folder."""
    return [str(p.relative_to(DOCS_ROOT)) for p in _iter_docs()]

@mcp.tool()
def search_docs(query: str, max_results: int = 10) -> List[Dict[str, str]]:
    """
    Full-text search over the docs with phrase and per-word matching.

    Args:
        query: Search query string.
        max_results: Max number of matches to return.

    Returns:
        List of {path, snippet, score, match_type} matches.
    """
    import re
    query_lower = query.lower()
    query_words = query_lower.split()
    results: list[dict[str, str]] = []
    for path in _iter_docs():
        text = _read_file(path)
        text_lower = text.lower()
        matches = []
        # First, try an exact phrase match (highest score)
        if query_lower in text_lower:
            idx = text_lower.find(query_lower)
            start = max(0, idx - 80)
            end = min(len(text), idx + 80)
            snippet = text[start:end].replace("\n", " ")
            matches.append({
                "score": 100,
                "snippet": snippet,
                "match_type": "exact_phrase"
            })
        # Then look for sentences containing most of the query words
        sentences = re.split(r'[.!?]+|\n\n+', text)
        for sentence in sentences:
            sentence_lower = sentence.lower()
            word_matches = sum(1 for word in query_words if word in sentence_lower)
            if word_matches >= max(1, len(query_words) * 0.6):  # at least 60% of the words
                # Score by the fraction of query words matched
                score = (word_matches / len(query_words)) * 80
                if len(sentence.strip()) > 20:  # prefer longer, more informative sentences
                    snippet = sentence.strip()[:160] + "..." if len(sentence.strip()) > 160 else sentence.strip()
                    matches.append({
                        "score": score,
                        "snippet": snippet,
                        "match_type": f"words_{word_matches}/{len(query_words)}"
                    })
        # Keep only the best match for this document
        if matches:
            best_match = max(matches, key=lambda x: x["score"])
            results.append({
                "path": str(path.relative_to(DOCS_ROOT)),
                "snippet": best_match["snippet"],
                "score": str(best_match["score"]),
                "match_type": best_match["match_type"]
            })
    # Sort by numeric score (highest first) and limit. The stored score is a
    # string, so compare as float; a lexical sort would rank "80.0" above "100".
    results.sort(key=lambda x: float(x["score"]), reverse=True)
    return results[:max_results]

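# Example call (illustrative result shape; paths and snippets depend on your
# docs/ tree). An exact phrase match scores 100; a sentence containing both of
# two query words scores (2/2) * 80 = 80.0, stored as a string:
#
#   search_docs("install dependencies", max_results=2)
#   # -> [{'path': 'getting-started.md',
#   #      'snippet': '... install the dependencies with pip ...',
#   #      'score': '100', 'match_type': 'exact_phrase'},
#   #     {'path': 'faq.md', 'snippet': '...',
#   #      'score': '80.0', 'match_type': 'words_2/2'}]
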
@mcp.tool()
def extract_section(relative_path: str, section_title: str, include_subsections: bool = True) -> Dict[str, Any]:
    """
    Extract a specific section from a document.

    Args:
        relative_path: Path to the document relative to the docs/ folder.
        section_title: Title of the section to extract (case-insensitive, partial matches allowed).
        include_subsections: Whether to include subsections in the extracted content.

    Returns:
        Dictionary with section content and metadata.
    """
    path = (DOCS_ROOT / relative_path).resolve()
    if not path.exists() or not path.is_file():
        return {"error": f"Document not found: {relative_path}"}
    if DOCS_ROOT not in path.parents:
        return {"error": "Access denied: path escapes docs root."}
    content = _read_file(path)
    # Hierarchical extraction keeps subsections; flat extraction stops at the next header
    if include_subsections:
        sections = _extract_hierarchical_sections(content)
    else:
        sections = _extract_sections(content)
    # Find matching sections (case-insensitive, partial match in either direction)
    section_title_lower = section_title.lower()
    matching_sections = []
    for section in sections:
        section_title_clean = section['title'].lstrip('#').strip().lower()
        if section_title_lower in section_title_clean or section_title_clean in section_title_lower:
            matching_sections.append(section)
    if not matching_sections:
        # List available sections for user reference
        available_sections = [s['title'].lstrip('#').strip() for s in sections if s['content'].strip()]
        return {
            "error": f"Section '{section_title}' not found",
            "available_sections": available_sections[:10],  # first 10 for readability
            "total_sections": str(len(available_sections))
        }
    if len(matching_sections) == 1:
        section = matching_sections[0]
        result = {
            "document": relative_path,
            "section_title": section['title'].lstrip('#').strip(),
            "content": section['content'].strip(),
            "word_count": str(len(section['content'].split())),
            "match_type": "single",
            "extraction_mode": "hierarchical" if include_subsections else "flat"
        }
        # Add metadata about subsections if available
        if 'includes_subsections' in section:
            result["includes_subsections"] = section['includes_subsections']
        if 'level' in section:
            result["header_level"] = section['level']
        return result
    # Multiple matches - return all of them
    results = []
    for section in matching_sections:
        section_info = {
            "section_title": section['title'].lstrip('#').strip(),
            "content": section['content'].strip(),
            "word_count": str(len(section['content'].split()))
        }
        if 'level' in section:
            section_info["header_level"] = section['level']
        if 'includes_subsections' in section:
            section_info["includes_subsections"] = section['includes_subsections']
        results.append(section_info)
    return {
        "document": relative_path,
        "match_type": "multiple",
        "matching_sections": results,
        "total_matches": str(len(results)),
        "extraction_mode": "hierarchical" if include_subsections else "flat"
    }

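# Example call (illustrative; "getting-started.md" and the section names are
# hypothetical): a partial, case-insensitive title is enough, and the default
# include_subsections=True pulls nested headers into the returned content.
#
#   extract_section("getting-started.md", "install")
#   # -> {'document': 'getting-started.md', 'section_title': 'Installation',
#   #     'content': '...', 'word_count': '42', 'match_type': 'single',
#   #     'extraction_mode': 'hierarchical', 'includes_subsections': True,
#   #     'header_level': 2}
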
@mcp.tool()
def summarize_document(relative_path: str, summary_type: str = "overview") -> Dict[str, str]:
    """
    Generate a smart summary of a specific document.

    Args:
        relative_path: Path to the document relative to the docs/ folder.
        summary_type: Type of summary - 'overview', 'key_points', 'detailed', or 'technical'.

    Returns:
        Dictionary with document info and a structured summary.
    """
    path = (DOCS_ROOT / relative_path).resolve()
    if not path.exists() or not path.is_file():
        return {"error": f"Document not found: {relative_path}"}
    if DOCS_ROOT not in path.parents:
        return {"error": "Access denied: path escapes docs root."}
    content = _read_file(path)
    word_count = len(content.split())
    # Extract key sections based on markdown headers
    sections = _extract_sections(content)
    # Generate the summary based on type
    if summary_type == "key_points":
        summary = _extract_key_points(content, sections)
    elif summary_type == "detailed":
        summary = _generate_detailed_summary(content, sections)
    elif summary_type == "technical":
        summary = _extract_technical_details(content, sections)
    else:  # overview
        summary = _generate_overview_summary(content, sections)
    return {
        "document": relative_path,
        "word_count": str(word_count),
        "sections": str(len(sections)),
        "summary_type": summary_type,
        "summary": summary
    }

@mcp.tool()
def analyze_document_structure(relative_path: str) -> Dict[str, Any]:
    """
    Analyze the structure and metadata of a document.

    Args:
        relative_path: Path to the document relative to the docs/ folder.

    Returns:
        Dictionary with structural analysis.
    """
    path = (DOCS_ROOT / relative_path).resolve()
    if not path.exists() or not path.is_file():
        return {"error": f"Document not found: {relative_path}"}
    content = _read_file(path)
    # Extract headers and create an outline
    headers = _extract_headers(content)
    sections = _extract_sections(content)
    # Basic statistics
    lines = content.split('\n')
    words = content.split()
    # Find code blocks and links
    code_blocks = _count_code_blocks(content)
    links = _extract_links(content)
    return {
        "document": relative_path,
        "statistics": {
            "lines": len(lines),
            "words": len(words),
            "characters": len(content),
            "sections": str(len(sections)),
            "code_blocks": code_blocks,
            "links": len(links)
        },
        "structure": {
            "headers": headers,
            "outline": _create_outline(headers)
        },
        "content_analysis": {
            "has_tables": "| " in content,
            "has_images": "![" in content,
            # Fenced or four-space-indented code
            "has_code": "```" in content or "\n    " in content,
            "external_links": [link for link in links if link.startswith("http")]
        }
    }

@mcp.tool()
def generate_doc_overview() -> Dict[str, Any]:
    """
    Generate a comprehensive overview of the entire documentation set.

    Returns:
        Dictionary with overall documentation analysis.
    """
    docs = _iter_docs()
    overview = {
        "total_documents": str(len(docs)),
        "documents_by_type": {},
        "total_content": {"words": 0, "lines": 0, "characters": 0},
        "structure_analysis": {"sections": 0, "code_blocks": 0},
        "document_summaries": []
    }
    for path in docs:
        content = _read_file(path)
        ext = path.suffix.lower()
        rel_path = str(path.relative_to(DOCS_ROOT))
        # Count by file type
        overview["documents_by_type"][ext] = overview["documents_by_type"].get(ext, 0) + 1
        # Aggregate statistics
        words = len(content.split())
        lines = len(content.split('\n'))
        overview["total_content"]["words"] += words
        overview["total_content"]["lines"] += lines
        overview["total_content"]["characters"] += len(content)
        # Structure analysis
        sections = len(_extract_sections(content))
        overview["structure_analysis"]["sections"] += sections
        overview["structure_analysis"]["code_blocks"] += _count_code_blocks(content)
        # Brief summary for each doc
        overview["document_summaries"].append({
            "path": rel_path,
            "words": words,
            "sections": sections,
            "brief_summary": _generate_brief_summary(content)
        })
    return overview

@mcp.tool()
def semantic_search(query: str, max_results: int = 5) -> List[Dict[str, Any]]:
    """
    Perform semantic search across documents using keyword matching and relevance scoring.

    Args:
        query: Search query.
        max_results: Maximum number of results to return.

    Returns:
        List of documents with relevance scores and context.
    """
    query_words = set(query.lower().split())
    results = []
    for path in _iter_docs():
        content = _read_file(path)
        content_lower = content.lower()
        # Calculate the relevance score
        score = 0
        context_snippets = []
        for word in query_words:
            word_count = content_lower.count(word)
            score += word_count * len(word)  # longer words get higher weight
            # Find every position of this query word
            word_positions = []
            start = 0
            while True:
                pos = content_lower.find(word, start)
                if pos == -1:
                    break
                word_positions.append(pos)
                start = pos + 1
            # Collect context snippets around the first occurrences
            for pos in word_positions[:2]:  # max 2 snippets per word
                snippet_start = max(0, pos - 60)
                snippet_end = min(len(content), pos + 60)
                context_snippets.append(content[snippet_start:snippet_end].replace('\n', ' '))
        if score > 0:
            # Normalize the score by document length
            normalized_score = score / len(content.split())
            results.append({
                'path': str(path.relative_to(DOCS_ROOT)),
                'relevance_score': normalized_score,
                'context_snippets': context_snippets[:3],  # max 3 snippets
                'word_count': len(content.split())
            })
    # Sort by relevance score, highest first
    results.sort(key=lambda x: x['relevance_score'], reverse=True)
    return results[:max_results]

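# Worked scoring example (comment only): for the query "docker setup", each
# occurrence of a word adds len(word) to the raw score. In a 100-word document
# containing "docker" (6 letters) 3 times and "setup" (5 letters) once, the
# raw score is 3*6 + 1*5 = 23 and the normalized relevance_score is
# 23 / 100 = 0.23, so long documents are not favored just for being long.
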
@mcp.tool()
def compare_documents(doc1_path: str, doc2_path: str) -> Dict[str, Any]:
    """
    Compare two documents and identify similarities and differences.

    Args:
        doc1_path: Path to the first document.
        doc2_path: Path to the second document.

    Returns:
        Comparison analysis.
    """
    path1 = (DOCS_ROOT / doc1_path).resolve()
    path2 = (DOCS_ROOT / doc2_path).resolve()
    if not path1.exists() or not path2.exists():
        return {"error": "One or both documents not found"}
    content1 = _read_file(path1)
    content2 = _read_file(path2)
    # Basic statistics comparison
    stats1 = {
        "words": len(content1.split()),
        "lines": len(content1.split('\n')),
        "characters": len(content1)
    }
    stats2 = {
        "words": len(content2.split()),
        "lines": len(content2.split('\n')),
        "characters": len(content2)
    }
    # Find common and unique words
    words1 = set(word.lower().strip('.,!?;:') for word in content1.split())
    words2 = set(word.lower().strip('.,!?;:') for word in content2.split())
    common_words = words1.intersection(words2)
    unique_to_doc1 = words1 - words2
    unique_to_doc2 = words2 - words1
    # Extract headers for structure comparison
    headers1 = [h['title'] for h in _extract_headers(content1)]
    headers2 = [h['title'] for h in _extract_headers(content2)]
    return {
        "document1": doc1_path,
        "document2": doc2_path,
        "statistics": {
            "doc1": stats1,
            "doc2": stats2,
            "size_ratio": stats1["words"] / stats2["words"] if stats2["words"] > 0 else float('inf')
        },
        "content_similarity": {
            "common_words_count": len(common_words),
            "unique_to_doc1_count": len(unique_to_doc1),
            "unique_to_doc2_count": len(unique_to_doc2),
            # Jaccard similarity of the two vocabulary sets
            "similarity_ratio": len(common_words) / len(words1.union(words2)) if words1.union(words2) else 0
        },
        "structure_comparison": {
            "doc1_headers": headers1,
            "doc2_headers": headers2,
            "common_headers": list(set(headers1).intersection(set(headers2))),
            "unique_headers_doc1": list(set(headers1) - set(headers2)),
            "unique_headers_doc2": list(set(headers2) - set(headers1))
        },
        "sample_unique_words": {
            "doc1": list(unique_to_doc1)[:10],
            "doc2": list(unique_to_doc2)[:10]
        }
    }

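# Worked example (comment only): similarity_ratio is the Jaccard similarity of
# the two vocabulary sets. If doc1's words are {a, b, c} and doc2's are
# {b, c, d}, the intersection {b, c} has 2 elements and the union {a, b, c, d}
# has 4, so the ratio is 2 / 4 = 0.5.
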
@mcp.tool()
def extract_definitions(relative_path: str) -> Dict[str, Any]:
    """
    Extract definitions, terms, and explanations from a document.

    Args:
        relative_path: Path to the document.

    Returns:
        Extracted definitions and terms.
    """
    path = (DOCS_ROOT / relative_path).resolve()
    if not path.exists():
        return {"error": f"Document not found: {relative_path}"}
    content = _read_file(path)
    definitions = []
    # Look for common definition patterns
    import re
    definition_patterns = [
        r'^([A-Z][^:\-\n]+):\s*(.+)$',     # Term: Definition
        r'^([A-Z][^:\-\n]+)\s*-\s*(.+)$',  # Term - Definition
        r'\*\*([^*]+)\*\*:\s*([^\n]+)',    # **Term**: Definition
        r'`([^`]+)`:\s*([^\n]+)'           # `Term`: Definition
    ]
    for pattern in definition_patterns:
        for term, definition in re.findall(pattern, content, re.MULTILINE):
            definitions.append({
                "term": term.strip(),
                "definition": definition.strip(),
                "type": "explicit"
            })
    # Look for glossary sections
    sections = _extract_sections(content)
    glossary_terms = []
    for section in sections:
        if any(keyword in section['title'].lower() for keyword in ['glossary', 'definition', 'terminology', 'terms']):
            for line in section['content'].split('\n'):
                if ':' in line or '-' in line:
                    parts = line.split(':') if ':' in line else line.split('-')
                    if len(parts) == 2:
                        glossary_terms.append({
                            "term": parts[0].strip(),
                            "definition": parts[1].strip(),
                            "type": "glossary"
                        })
    # Extract technical terms (words in backticks)
    tech_terms_unique = list(set(re.findall(r'`([^`]+)`', content)))
    return {
        "document": relative_path,
        "definitions": definitions,
        "glossary_terms": glossary_terms,
        "technical_terms": tech_terms_unique,
        "total_definitions": str(len(definitions) + len(glossary_terms)),
        "definition_density": (len(definitions) + len(glossary_terms)) / len(content.split()) if content.split() else 0
    }

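# Illustrative matches (comment only): each pattern captures a
# (term, definition) pair from lines such as these:
#
#   API Key: a token used to authenticate requests   -> ('API Key', 'a token ...')
#   **Endpoint**: the URL a request is sent to       -> ('Endpoint', 'the URL ...')
#   `DOCS_ROOT`: base folder scanned for documents   -> ('DOCS_ROOT', 'base folder ...')
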
@mcp.tool()
def generate_table_of_contents(relative_path: str | None = None) -> Dict[str, Any]:
    """
    Generate a table of contents for a specific document or all documents.

    Args:
        relative_path: Path to a specific document, or None for all documents.

    Returns:
        Table of contents structure.
    """
    if relative_path:
        # Single-document TOC
        path = (DOCS_ROOT / relative_path).resolve()
        if not path.exists():
            return {"error": f"Document not found: {relative_path}"}
        content = _read_file(path)
        headers = _extract_headers(content)
        return {
            "document": relative_path,
            "table_of_contents": _create_outline(headers),
            "header_count": len(headers),
            "max_depth": max(h['level'] for h in headers) if headers else 0
        }
    # TOC across all documents
    all_toc = {}
    for path in _iter_docs():
        content = _read_file(path)
        headers = _extract_headers(content)
        rel_path = str(path.relative_to(DOCS_ROOT))
        all_toc[rel_path] = {
            "outline": _create_outline(headers),
            "header_count": len(headers),
            "max_depth": max(h['level'] for h in headers) if headers else 0
        }
    return {
        "type": "complete_documentation_toc",
        "documents": all_toc,
        "total_documents": str(len(all_toc))
    }

@mcp.tool()
def intelligent_summarize(relative_path: str, summary_type: str = "medium", focus_keywords: str | None = None) -> Dict[str, Any]:
    """
    Generate an intelligent summary using advanced text analysis.

    Args:
        relative_path: Path to the document.
        summary_type: "short", "medium", or "long".
        focus_keywords: Optional comma-separated keywords to focus on.

    Returns:
        Intelligent summary with analysis.
    """
    path = (DOCS_ROOT / relative_path).resolve()
    if not path.exists():
        return {"error": f"Document not found: {relative_path}"}
    try:
        content = _read_file(path)
        # Use document intelligence for a smart summary
        summary_result = doc_intel.generate_smart_summary(content, summary_type)
        # Add key concepts
        key_concepts = doc_intel.extract_key_concepts(content)
        # Add readability analysis
        readability = doc_intel.analyze_readability(content)
        # If focus keywords were provided, report the sections that mention them
        focused_content = None
        if focus_keywords:
            keywords = [k.strip() for k in focus_keywords.split(',')]
            sections = _extract_sections(content)
            relevant_sections = []
            for section in sections:
                if section['content'].strip() and any(keyword.lower() in section['content'].lower() for keyword in keywords):
                    relevant_sections.append(section['title'].lstrip('#').strip())
            focused_content = relevant_sections
        return {
            "document": relative_path,
            "summary": summary_result,
            "key_concepts": key_concepts[:10],
            "readability": readability,
            "focused_sections": focused_content,
            "analysis_method": "advanced_intelligence"
        }
    except Exception as e:
        return {
            "error": f"Failed to analyze document: {e}",
            "document": relative_path,
            "fallback_available": True
        }

@mcp.tool()
def extract_qa_pairs(relative_path: str | None = None) -> Dict[str, Any]:
    """
    Extract question-answer pairs from documents for FAQ generation.

    Args:
        relative_path: Specific document path, or None for all documents.

    Returns:
        Extracted Q&A pairs.
    """
    if relative_path:
        path = (DOCS_ROOT / relative_path).resolve()
        if not path.exists():
            return {"error": f"Document not found: {relative_path}"}
        content = _read_file(path)
        qa_pairs = doc_intel.extract_questions_and_answers(content)
        return {
            "document": relative_path,
            "qa_pairs": qa_pairs,
            "total_pairs": str(len(qa_pairs))
        }
    # Extract from all documents
    all_qa_pairs = {}
    total_pairs = 0
    for path in _iter_docs():
        content = _read_file(path)
        qa_pairs = doc_intel.extract_questions_and_answers(content)
        if qa_pairs:
            rel_path = str(path.relative_to(DOCS_ROOT))
            all_qa_pairs[rel_path] = qa_pairs
            total_pairs += len(qa_pairs)
    return {
        "type": "complete_documentation_qa",
        "qa_by_document": all_qa_pairs,
        "total_pairs": str(total_pairs)
    }

@mcp.tool()
def find_related_documents(query: str, max_results: int = 3) -> Dict[str, Any]:
    """
    Find the documents most related to a query using advanced similarity scoring.

    Args:
        query: Search query or topic.
        max_results: Maximum number of related documents to return.

    Returns:
        Related documents with scores and explanations.
    """
    all_docs = list(_iter_docs())
    related = doc_intel.find_related_content(query, all_docs, max_results)
    return {
        "query": query,
        "related_documents": related,
        "total_analyzed": len(all_docs),
        "method": "tf-idf_similarity"
    }

@mcp.tool()
def analyze_document_gaps() -> Dict[str, Any]:
    """
    Analyze the documentation set to identify potential gaps or areas needing improvement.

    Returns:
        Analysis of documentation completeness and suggestions.
    """
    all_docs = list(_iter_docs())
    analysis = {
        "total_documents": len(all_docs),
        "coverage_analysis": {},
        "recommendations": [],
        "content_quality": {},
        "structure_issues": []
    }
    # Analyze each document
    total_words = 0
    short_docs = []
    long_docs = []
    low_readability_docs = []
    missing_sections = []
    common_sections = ['introduction', 'overview', 'getting started', 'configuration', 'examples', 'troubleshooting']
    section_coverage = {section: 0 for section in common_sections}
    for path in all_docs:
        content = _read_file(path)
        rel_path = str(path.relative_to(DOCS_ROOT))
        # Word-count analysis
        word_count = len(content.split())
        total_words += word_count
        if word_count < 100:
            short_docs.append(rel_path)
        elif word_count > 3000:
            long_docs.append(rel_path)
        # Readability analysis
        readability = doc_intel.analyze_readability(content)
        if readability.get('flesch_score', 50) < 30:
            low_readability_docs.append(rel_path)
        # Section-coverage analysis
        headers = [h['title'].lower() for h in _extract_headers(content)]
        doc_sections = []
        for section in common_sections:
            if any(section in header for header in headers):
                section_coverage[section] += 1
                doc_sections.append(section)
        missing = [s for s in common_sections if s not in doc_sections]
        if missing:
            missing_sections.append({"document": rel_path, "missing": missing})
    # Generate recommendations
    if short_docs:
        analysis["recommendations"].append(f"Consider expanding these short documents: {', '.join(short_docs[:3])}")
    if low_readability_docs:
        analysis["recommendations"].append(f"Improve readability of: {', '.join(low_readability_docs[:3])}")
    # Find the least-covered section types
    least_covered = min(section_coverage.values())
    missing_section_types = [section for section, count in section_coverage.items() if count <= least_covered]
    if missing_section_types:
        analysis["recommendations"].append(f"Consider adding {', '.join(missing_section_types)} sections to more documents")
    analysis["coverage_analysis"] = {
        "average_words_per_doc": total_words / len(all_docs) if all_docs else 0,
        "short_documents": short_docs,
        "long_documents": long_docs,
        "section_coverage": section_coverage
    }
    analysis["content_quality"] = {
        "low_readability": low_readability_docs,
        "missing_common_sections": missing_sections
    }
    return analysis

@mcp.tool()
def generate_documentation_index() -> Dict[str, Any]:
    """
    Generate a comprehensive searchable index of all documentation content.

    Returns:
        Searchable index with topics, concepts, and cross-references.
    """
    index = {
        "concepts": {},          # concept -> [documents]
        "topics": {},            # topic -> documents
        "cross_references": {},  # document -> related documents
        "metadata": {}
    }
    all_docs = list(_iter_docs())
    # Build the concept index
    all_concepts = {}
    for path in all_docs:
        content = _read_file(path)
        rel_path = str(path.relative_to(DOCS_ROOT))
        # Extract concepts from this document
        concepts = doc_intel.extract_key_concepts(content, min_frequency=1)
        # Add them to the global concept index
        for concept_info in concepts:
            concept = concept_info['concept']
            if concept not in all_concepts:
                all_concepts[concept] = []
            all_concepts[concept].append({
                "document": rel_path,
                "frequency": concept_info['frequency'],
                "type": concept_info['type']
            })
        # Find cross-references (documents with similar concepts)
        related_docs = doc_intel.find_related_content(
            ' '.join([c['concept'] for c in concepts[:5]]),
            all_docs,
            max_results=3
        )
        index["cross_references"][rel_path] = [doc['path'] for doc in related_docs if doc['path'] != rel_path]
        # Document metadata
        headers = _extract_headers(content)
        readability = doc_intel.analyze_readability(content)
        index["metadata"][rel_path] = {
            "word_count": len(content.split()),
            "sections": len(headers),
            "readability_score": readability.get('flesch_score', 0),
            "main_topics": [c['concept'] for c in concepts[:5]]
        }
    # Keep concepts that appear in multiple documents or repeatedly in one,
    # since those are the most valuable index entries
    index["concepts"] = {
        concept: docs for concept, docs in all_concepts.items()
        if len(docs) > 1 or any(d['frequency'] > 2 for d in docs)
    }
    # Create topic clusters from concepts shared by several documents
    topic_clusters = {}
    for concept, docs in index["concepts"].items():
        if len(docs) >= 2:
            topic_clusters[concept] = [doc['document'] for doc in docs]
    index["topics"] = topic_clusters
    return {
        "index": index,
        "statistics": {
            "total_concepts": len(index["concepts"]),
            "total_topics": len(index["topics"]),
            "total_documents": len(all_docs),
            "avg_cross_references": (
                sum(len(refs) for refs in index["cross_references"].values()) / len(index["cross_references"])
                if index["cross_references"] else 0
            )
        }
    }

if __name__ == "__main__":
    # stdio transport keeps it compatible with the official client pattern
    mcp.run(transport="stdio")
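
# Launch sketch (assumed workflow, not part of the server): run this file
# directly, then connect any MCP stdio client to it. With the official Python
# SDK the client side looks roughly like this (paths are hypothetical):
#
#   python server_docs.py
#
#   # server = StdioServerParameters(command="python", args=["server_docs.py"])
#   # async with stdio_client(server) as (read, write):
#   #     async with ClientSession(read, write) as session:
#   #         await session.initialize()
#   #         print(await session.call_tool("list_docs", {}))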