feat: Implement Graph RAG pipeline with chunking, vector storage, and graph building
- Added `rag` module with core components:
- `chunker.py`: Implements semantic chunking of documents.
- `vector_store.py`: Integrates ChromaDB for storing and retrieving document chunks.
- `graph_builder.py`: Constructs a knowledge graph from document chunks, establishing relationships based on similarity and section headings.
- `groq_chat.py`: Facilitates chat interactions using Groq API with context from the knowledge graph.
- `rag_pipeline.py`: Orchestrates the entire RAG process, from ingestion to querying.
- Introduced `PipelineState` to manage the state of the RAG pipeline.
- Enhanced document processing with robust text extraction and chunking strategies.
- Added support for entity linking and cross-document similarity in the graph.
- Integrated debug utilities for inspecting raw document attributes.
- requirements.txt +9 -0
- src/rag/__init__.py +12 -0
- src/rag/chunker.py +231 -0
- src/rag/graph_builder.py +245 -0
- src/rag/groq_chat.py +130 -0
- src/rag/rag_pipeline.py +149 -0
- src/rag/vector_store.py +130 -0
- src/streamlit_app.py +249 -316
- src/utils/__pycache__/visualization.cpython-313.pyc +0 -0
|
@@ -25,6 +25,15 @@ numpy>=1.26.0
|
|
| 25 |
pandas>=2.2.0
|
| 26 |
Pillow>=10.2.0
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
# Utilities
|
| 29 |
python-dotenv>=1.0.0
|
| 30 |
|
|
|
|
| 25 |
pandas>=2.2.0
|
| 26 |
Pillow>=10.2.0
|
| 27 |
|
| 28 |
+
# Vector DB (Graph RAG)
|
| 29 |
+
chromadb>=0.5.0
|
| 30 |
+
|
| 31 |
+
# Knowledge Graph (Graph RAG)
|
| 32 |
+
networkx>=3.2.0
|
| 33 |
+
|
| 34 |
+
# Groq API (Graph RAG Chat)
|
| 35 |
+
groq>=0.9.0
|
| 36 |
+
|
| 37 |
# Utilities
|
| 38 |
python-dotenv>=1.0.0
|
| 39 |
|
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .chunker import Chunk, chunk_document, chunk_text
|
| 2 |
+
from .vector_store import VectorStore
|
| 3 |
+
from .graph_builder import GraphBuilder
|
| 4 |
+
from .groq_chat import GroqGraphChat
|
| 5 |
+
|
| 6 |
+
__all__ = [
|
| 7 |
+
"Chunk", "chunk_document", "chunk_text",
|
| 8 |
+
"VectorStore",
|
| 9 |
+
"GraphBuilder",
|
| 10 |
+
"GroqGraphChat",
|
| 11 |
+
]
|
| 12 |
+
|
|
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Smart Semantic Chunker
|
| 3 |
+
Chunks documents efficiently using sentence boundaries + structural signals.
|
| 4 |
+
"""
|
| 5 |
+
import re
|
| 6 |
+
from typing import List, Dict, Any
|
| 7 |
+
from dataclasses import dataclass, field
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@dataclass
class Chunk:
    """A contiguous piece of a source document, ready for embedding/indexing."""
    chunk_id: str     # globally unique: f"{doc_id}_chunk_{chunk_index}"
    doc_id: str       # "doc1" or "doc2"
    text: str         # the chunk's full text
    chunk_index: int  # 0-based position within the document
    section: str = ""  # heading/section title if detected
    page: int = 0      # page number; 0 when unknown (not set by chunk_text)
    metadata: Dict[str, Any] = field(default_factory=dict)  # e.g. {"word_count": int}
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _split_sentences(text: str) -> List[str]:
|
| 22 |
+
"""Split text into sentences using regex."""
|
| 23 |
+
sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text.strip())
|
| 24 |
+
return [s.strip() for s in sentences if s.strip()]
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _detect_heading(line: str) -> bool:
|
| 28 |
+
"""Detect if a line looks like a section heading."""
|
| 29 |
+
line = line.strip()
|
| 30 |
+
if not line:
|
| 31 |
+
return False
|
| 32 |
+
if re.match(r'^(\d+[\.\)]\s+|[A-Z][A-Z\s]{3,50}$)', line):
|
| 33 |
+
return True
|
| 34 |
+
if len(line) < 80 and not line.endswith('.') and line[0].isupper():
|
| 35 |
+
if re.match(r'^(Abstract|Introduction|Conclusion|Method|Result|Discussion|Background|Overview|Summary)', line, re.I):
|
| 36 |
+
return True
|
| 37 |
+
return False
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def chunk_text(
    text: str,
    doc_id: str,
    chunk_size: int = 300,
    overlap: int = 50,
) -> List[Chunk]:
    """
    Semantic chunking with section awareness, sentence boundary respect,
    and sliding window overlap.

    Args:
        text: Full document text (newline-separated lines).
        doc_id: Identifier stamped onto every produced Chunk.
        chunk_size: Target chunk size in words; the sentence buffer is
            flushed into a Chunk once it reaches this many words.
        overlap: Max words of trailing sentences carried into the next chunk.

    Returns:
        List of Chunk objects in document order.
    """
    chunks = []
    lines = text.split('\n')

    current_section = "General"
    buffer_sentences = []
    buffer_words = 0
    chunk_index = 0
    paragraph_buffer = []

    def flush_buffer(section: str) -> None:
        """Emit the buffered sentences as one Chunk, keeping an overlap tail."""
        nonlocal chunk_index, buffer_sentences, buffer_words
        if not buffer_sentences:
            return
        chunk_text_val = ' '.join(buffer_sentences)
        chunks.append(Chunk(
            chunk_id=f"{doc_id}_chunk_{chunk_index}",
            doc_id=doc_id,
            text=chunk_text_val,
            chunk_index=chunk_index,
            section=section,
            metadata={"word_count": buffer_words}
        ))
        chunk_index += 1
        # Keep trailing sentences (up to `overlap` words) so the next chunk
        # shares context with this one (sliding window).
        overlap_sentences = []
        overlap_words = 0
        for sent in reversed(buffer_sentences):
            w = len(sent.split())
            if overlap_words + w <= overlap:
                overlap_sentences.insert(0, sent)
                overlap_words += w
            else:
                break
        buffer_sentences = overlap_sentences
        buffer_words = overlap_words

    def ingest_paragraph(flush_on_size: bool = True) -> None:
        """Move the accumulated paragraph into the sentence buffer.

        This logic was previously triplicated (heading branch, blank-line
        branch, end-of-document); factored out so the call sites can't drift.
        When `flush_on_size` is True, a Chunk is emitted each time the buffer
        reaches `chunk_size` words (matching the in-loop behavior); the
        end-of-document call defers flushing to the final flush_buffer().
        """
        nonlocal paragraph_buffer, buffer_words
        if not paragraph_buffer:
            return
        for sent in _split_sentences(' '.join(paragraph_buffer)):
            buffer_sentences.append(sent)
            buffer_words += len(sent.split())
            if flush_on_size and buffer_words >= chunk_size:
                flush_buffer(current_section)
        paragraph_buffer = []

    for line in lines:
        stripped = line.strip()

        if _detect_heading(stripped):
            # Close out the current section before switching to the new one.
            ingest_paragraph()
            flush_buffer(current_section)
            current_section = stripped
            continue

        if stripped:
            paragraph_buffer.append(stripped)
        else:
            # Blank line terminates a paragraph.
            ingest_paragraph()

    # Flush whatever remains at end of document.
    ingest_paragraph(flush_on_size=False)
    flush_buffer(current_section)

    return chunks
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# ββ Debug helper ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 128 |
+
|
| 129 |
+
def debug_raw_doc(raw_doc) -> str:
    """Summarise every attribute of *raw_doc* as a human-readable string."""
    summary = [f"Type: {type(raw_doc).__name__}"]
    try:
        # Pydantic models expose model_dump(); fall back to __dict__.
        attrs = raw_doc.model_dump() if hasattr(raw_doc, 'model_dump') else vars(raw_doc)
        for name, value in attrs.items():
            if isinstance(value, str):
                summary.append(f" str attr '{name}': len={len(value)} preview={repr(value[:80])}")
            elif isinstance(value, list):
                summary.append(f" list attr '{name}': len={len(value)}")
            else:
                summary.append(f" attr '{name}': {type(value).__name__} = {repr(str(value)[:60])}")
    except Exception as exc:
        summary.append(f" (could not introspect: {exc})")
    return '\n'.join(summary)
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
# ββ Robust text extraction ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 147 |
+
|
| 148 |
+
def extract_text_from_raw_doc(raw_doc) -> str:
    """
    Robustly extract text from whatever RawDocument the ingestion agent returns.

    Tries, in order: known string attributes, list-like page/section
    containers, a pydantic/vars dump (longest string field), and finally
    str(raw_doc). Returns "" when nothing usable is found.
    """
    # Strategy 1: common direct string attributes.
    for name in ('text_content', 'content', 'text', 'raw_text', 'full_text',
                 'body', 'extracted_text', 'plain_text', 'document_text'):
        candidate = getattr(raw_doc, name, None)
        if candidate and isinstance(candidate, str) and len(candidate.strip()) > 10:
            return candidate.strip()

    # Strategy 2: list of pages / sections; each element may be a string,
    # an object with .text/.content, or a dict.
    for name in ('pages', 'sections', 'chunks', 'paragraphs', 'text_chunks'):
        container = getattr(raw_doc, name, None)
        if not (container and isinstance(container, list)):
            continue
        pieces = []
        for element in container:
            if isinstance(element, str):
                pieces.append(element)
            elif hasattr(element, 'text') and isinstance(element.text, str):
                pieces.append(element.text)
            elif hasattr(element, 'content') and isinstance(element.content, str):
                pieces.append(element.content)
            elif isinstance(element, dict):
                pieces.append(str(element.get('text') or element.get('content') or ''))
        joined = '\n'.join(p for p in pieces if p.strip())
        if len(joined.strip()) > 10:
            return joined.strip()

    # Strategy 3: pydantic model_dump / __dict__ — preferred keys first,
    # then the longest string field of all.
    try:
        dumped = raw_doc.model_dump() if hasattr(raw_doc, 'model_dump') else vars(raw_doc)
        for key in ('text_content', 'content', 'text', 'raw_text', 'full_text', 'body'):
            value = dumped.get(key)
            if isinstance(value, str) and len(value.strip()) > 10:
                return value.strip()
        longest = max(
            ((k, v) for k, v in dumped.items() if isinstance(v, str)),
            key=lambda kv: len(kv[1]),
            default=(None, ''),
        )
        if len(longest[1]) > 100:
            return longest[1].strip()
    except Exception:
        pass

    # Strategy 4: str() fallback — skip repr-style "<...>" output.
    rendered = str(raw_doc)
    if len(rendered) > 50 and not rendered.startswith('<'):
        return rendered

    return ""
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def chunk_document(raw_doc, doc_id: str, chunk_size: int = 300, overlap: int = 50) -> List[Chunk]:
    """
    Chunk a RawDocument object from the ingestion agent.

    Always returns at least one Chunk: a section="Error" placeholder when no
    text could be extracted, or a section="General" fallback (first 500 chars)
    when chunking produced nothing.
    """
    extracted = extract_text_from_raw_doc(raw_doc)

    if not extracted:
        # Embed a truncated attribute dump so the failure is visible in the UI.
        return [Chunk(
            chunk_id=f"{doc_id}_chunk_0",
            doc_id=doc_id,
            text=f"[Could not extract text from {doc_id}. Attributes: {debug_raw_doc(raw_doc)[:200]}]",
            chunk_index=0,
            section="Error",
        )]

    result = chunk_text(extracted, doc_id, chunk_size, overlap)
    if result:
        return result

    return [Chunk(
        chunk_id=f"{doc_id}_chunk_0",
        doc_id=doc_id,
        text=extracted[:500],
        chunk_index=0,
        section="General",
    )]
|
|
@@ -0,0 +1,245 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Graph RAG β Knowledge Graph Builder
|
| 3 |
+
Builds a NetworkX graph where:
|
| 4 |
+
- Nodes = chunks (from doc1 & doc2)
|
| 5 |
+
- Edges = relationships between chunks:
|
| 6 |
+
* sequential : consecutive chunks in same document
|
| 7 |
+
* same_section : chunks sharing the same heading/section
|
| 8 |
+
* cross_similar: high cosine similarity between doc1 chunk & doc2 chunk
|
| 9 |
+
* entity_link : chunks sharing important noun phrases (entities)
|
| 10 |
+
"""
|
| 11 |
+
import re
|
| 12 |
+
import networkx as nx
|
| 13 |
+
from typing import List, Dict, Any, Tuple
|
| 14 |
+
from sentence_transformers import SentenceTransformer
|
| 15 |
+
import numpy as np
|
| 16 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 17 |
+
|
| 18 |
+
from .chunker import Chunk
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
_EMBED_MODEL_NAME = "all-MiniLM-L6-v2"
|
| 22 |
+
_CROSS_SIM_THRESHOLD = 0.55 # min similarity to create a cross-doc edge
|
| 23 |
+
_ENTITY_MIN_LEN = 4 # min characters for an entity term
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _extract_noun_phrases(text: str) -> set:
|
| 27 |
+
"""
|
| 28 |
+
Lightweight noun phrase extraction via regex patterns.
|
| 29 |
+
No spacy dependency β works in constrained environments.
|
| 30 |
+
"""
|
| 31 |
+
# Capitalised multi-word phrases and key technical terms
|
| 32 |
+
patterns = [
|
| 33 |
+
r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b', # "Neural Network", "New York"
|
| 34 |
+
r'\b[A-Z]{2,}\b', # acronyms: "RAG", "LLM"
|
| 35 |
+
r'\b\w{5,}\b', # any long word (catch technical terms)
|
| 36 |
+
]
|
| 37 |
+
entities = set()
|
| 38 |
+
for pat in patterns:
|
| 39 |
+
found = re.findall(pat, text)
|
| 40 |
+
entities.update(f.strip().lower() for f in found if len(f) >= _ENTITY_MIN_LEN)
|
| 41 |
+
# Remove very common stopwords
|
| 42 |
+
stopwords = {'which', 'these', 'those', 'their', 'there', 'where', 'about',
|
| 43 |
+
'would', 'could', 'should', 'other', 'being', 'using', 'having'}
|
| 44 |
+
return entities - stopwords
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class GraphBuilder:
    """
    Builds and queries a knowledge graph from doc chunks.

    Nodes are chunk_ids; edges carry a "relation" label (sequential,
    same_section, cross_similar, entity_link) and a "weight" that drives
    the weight-prioritised BFS in `retrieve`.
    """

    def __init__(self):
        # Sentence-transformer used only for cross-document similarity edges.
        self._model = SentenceTransformer(_EMBED_MODEL_NAME)
        self.graph: nx.Graph = nx.Graph()
        self._chunk_map: Dict[str, Chunk] = {}  # chunk_id -> Chunk

    # ------------------------------------------------------------------
    # Build
    # ------------------------------------------------------------------

    def build(self, doc1_chunks: List[Chunk], doc2_chunks: List[Chunk]) -> nx.Graph:
        """
        Full graph construction pipeline.
        Resets any previous graph, then adds nodes and the four edge types.
        Returns the built NetworkX graph.
        """
        self.graph = nx.Graph()
        self._chunk_map = {}

        all_chunks = doc1_chunks + doc2_chunks

        # 1. Add nodes
        for chunk in all_chunks:
            self._chunk_map[chunk.chunk_id] = chunk
            self.graph.add_node(
                chunk.chunk_id,
                text=chunk.text[:200],  # store snippet
                doc_id=chunk.doc_id,
                section=chunk.section,
                chunk_index=chunk.chunk_index,
                entities=list(_extract_noun_phrases(chunk.text)),
            )

        # 2. Sequential edges (within same doc)
        self._add_sequential_edges(doc1_chunks)
        self._add_sequential_edges(doc2_chunks)

        # 3. Same-section edges
        self._add_section_edges(all_chunks)

        # 4. Cross-document similarity edges
        self._add_cross_similarity_edges(doc1_chunks, doc2_chunks)

        # 5. Entity co-occurrence edges
        self._add_entity_edges(all_chunks)

        return self.graph

    def _add_sequential_edges(self, chunks: List[Chunk]) -> None:
        """Link consecutive chunks of one document in reading order."""
        sorted_chunks = sorted(chunks, key=lambda c: c.chunk_index)
        for i in range(len(sorted_chunks) - 1):
            a, b = sorted_chunks[i], sorted_chunks[i + 1]
            self.graph.add_edge(
                a.chunk_id, b.chunk_id,
                relation="sequential",
                weight=0.9,
            )

    def _add_section_edges(self, chunks: List[Chunk]) -> None:
        """Fully connect chunks that share the same (doc_id, section) pair."""
        section_map: Dict[str, List[str]] = {}
        for chunk in chunks:
            key = f"{chunk.doc_id}::{chunk.section}"
            section_map.setdefault(key, []).append(chunk.chunk_id)

        for ids in section_map.values():
            for i in range(len(ids)):
                for j in range(i + 1, len(ids)):
                    # Don't overwrite an existing (e.g. sequential) edge.
                    if not self.graph.has_edge(ids[i], ids[j]):
                        self.graph.add_edge(
                            ids[i], ids[j],
                            relation="same_section",
                            weight=0.6,
                        )

    def _add_cross_similarity_edges(
        self, doc1_chunks: List[Chunk], doc2_chunks: List[Chunk]
    ) -> None:
        """Connect doc1/doc2 chunk pairs whose embedding cosine similarity
        is at least _CROSS_SIM_THRESHOLD; similarity becomes the edge weight."""
        if not doc1_chunks or not doc2_chunks:
            return

        texts1 = [c.text for c in doc1_chunks]
        texts2 = [c.text for c in doc2_chunks]

        emb1 = self._model.encode(texts1, batch_size=32, show_progress_bar=False)
        emb2 = self._model.encode(texts2, batch_size=32, show_progress_bar=False)

        sim_matrix = cosine_similarity(emb1, emb2)

        for i, c1 in enumerate(doc1_chunks):
            for j, c2 in enumerate(doc2_chunks):
                sim = float(sim_matrix[i, j])
                if sim >= _CROSS_SIM_THRESHOLD:
                    self.graph.add_edge(
                        c1.chunk_id, c2.chunk_id,
                        relation="cross_similar",
                        weight=round(sim, 4),
                        similarity=round(sim, 4),
                    )

    def _add_entity_edges(self, chunks: List[Chunk]) -> None:
        """Link chunks from different documents that mention the same entity."""
        entity_to_chunks: Dict[str, List[str]] = {}
        for chunk in chunks:
            entities = _extract_noun_phrases(chunk.text)
            for ent in entities:
                entity_to_chunks.setdefault(ent, []).append(chunk.chunk_id)

        for ent, ids in entity_to_chunks.items():
            if len(ids) < 2:
                continue
            # Only connect cross-doc pairs to avoid too many same-doc entity edges
            # NOTE(review): this dict comprehension keeps only the LAST chunk
            # per doc_id for each entity, so at most one representative pair
            # is linked — confirm that is intended rather than linking all
            # cross-doc pairs.
            doc_ids = {self._chunk_map[cid].doc_id: cid for cid in ids}
            if len(doc_ids) >= 2:
                cids = list(doc_ids.values())
                for i in range(len(cids)):
                    for j in range(i + 1, len(cids)):
                        if not self.graph.has_edge(cids[i], cids[j]):
                            self.graph.add_edge(
                                cids[i], cids[j],
                                relation="entity_link",
                                entity=ent,
                                weight=0.5,
                            )

    # ------------------------------------------------------------------
    # Query
    # ------------------------------------------------------------------

    def retrieve(
        self,
        query: str,
        seed_chunks: List[Dict[str, Any]],  # from VectorStore.search()
        hops: int = 2,
        max_nodes: int = 10,
    ) -> List[Dict[str, Any]]:
        """
        Graph-aware retrieval:
        1. Start from seed chunk nodes (vector search results)
        2. Expand via BFS up to `hops` hops, prioritising high-weight edges
        3. Return unique chunks from both docs, ranked by relevance

        Note: `query` is not used for scoring here; ranking comes solely
        from edge weights multiplied along the BFS path from a seed.
        """
        visited = set()
        result_nodes = []

        seed_ids = [
            f"{s['doc_id']}_chunk_{s['chunk_index']}"
            for s in seed_chunks
            if s.get('chunk_index') is not None
        ]

        # BFS queue: (node_id, remaining_hops, accumulated_weight)
        # NOTE(review): list.pop(0) below is O(n); collections.deque would
        # be cheaper, though the queue stays small (max_nodes-bounded loop).
        queue = [(nid, hops, 1.0) for nid in seed_ids if nid in self.graph]

        while queue and len(result_nodes) < max_nodes:
            node_id, remaining, acc_weight = queue.pop(0)
            if node_id in visited:
                continue
            visited.add(node_id)

            chunk = self._chunk_map.get(node_id)
            if chunk:
                result_nodes.append({
                    "chunk_id": node_id,
                    "text": chunk.text,
                    "doc_id": chunk.doc_id,
                    "section": chunk.section,
                    "relevance": round(acc_weight, 4),
                })

            if remaining > 0:
                neighbors = sorted(
                    self.graph[node_id].items(),
                    key=lambda x: x[1].get("weight", 0),
                    reverse=True,
                )
                for neighbor_id, edge_data in neighbors[:4]:  # top-4 neighbours
                    if neighbor_id not in visited:
                        queue.append((
                            neighbor_id,
                            remaining - 1,
                            acc_weight * edge_data.get("weight", 0.5),
                        ))

        # Sort by relevance
        result_nodes.sort(key=lambda x: x["relevance"], reverse=True)
        return result_nodes[:max_nodes]

    def get_stats(self) -> Dict[str, Any]:
        """Return node/edge counts plus a per-relation edge-count breakdown."""
        edge_types = {}
        for _, _, data in self.graph.edges(data=True):
            rel = data.get("relation", "unknown")
            edge_types[rel] = edge_types.get(rel, 0) + 1
        return {
            "nodes": self.graph.number_of_nodes(),
            "edges": self.graph.number_of_edges(),
            "edge_types": edge_types,
        }
|
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Groq Chat with Graph RAG context injection.
|
| 3 |
+
Uses llama-3.3-70b-versatile (fast + smart) via Groq API.
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
from typing import List, Dict, Any, Generator
|
| 7 |
+
from groq import Groq
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
_DEFAULT_MODEL = "llama-3.3-70b-versatile"
|
| 11 |
+
_MAX_CONTEXT_CHARS = 6000 # stay within context window safely
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def _build_context(retrieved_nodes: List[Dict[str, Any]]) -> str:
|
| 15 |
+
"""
|
| 16 |
+
Format retrieved graph nodes into a clean context block for the LLM.
|
| 17 |
+
Groups by document for clarity.
|
| 18 |
+
"""
|
| 19 |
+
doc1_nodes = [n for n in retrieved_nodes if n.get("doc_id") == "doc1"]
|
| 20 |
+
doc2_nodes = [n for n in retrieved_nodes if n.get("doc_id") == "doc2"]
|
| 21 |
+
|
| 22 |
+
parts = []
|
| 23 |
+
|
| 24 |
+
if doc1_nodes:
|
| 25 |
+
parts.append("### Relevant passages from Document 1:")
|
| 26 |
+
for node in doc1_nodes:
|
| 27 |
+
sec = f" [{node['section']}]" if node.get("section") else ""
|
| 28 |
+
parts.append(f"- {node['text'][:500]}{sec}")
|
| 29 |
+
|
| 30 |
+
if doc2_nodes:
|
| 31 |
+
parts.append("\n### Relevant passages from Document 2:")
|
| 32 |
+
for node in doc2_nodes:
|
| 33 |
+
sec = f" [{node['section']}]" if node.get("section") else ""
|
| 34 |
+
parts.append(f"- {node['text'][:500]}{sec}")
|
| 35 |
+
|
| 36 |
+
context = "\n".join(parts)
|
| 37 |
+
return context[:_MAX_CONTEXT_CHARS]
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
_SYSTEM_PROMPT = """You are an expert document analyst assistant with access to two documents that have been processed, chunked, and indexed using a Knowledge Graph RAG system.
|
| 41 |
+
|
| 42 |
+
You will be given:
|
| 43 |
+
1. CONTEXT: Relevant passages retrieved from both documents via graph-enhanced semantic search
|
| 44 |
+
2. USER QUESTION: What the user wants to know
|
| 45 |
+
|
| 46 |
+
Your job:
|
| 47 |
+
- Answer using ONLY the provided context
|
| 48 |
+
- Clearly indicate which document (Document 1 or Document 2) information comes from
|
| 49 |
+
- If comparing both documents, highlight similarities and differences
|
| 50 |
+
- If the context doesn't contain the answer, say so honestly
|
| 51 |
+
- Be concise, accurate, and helpful
|
| 52 |
+
"""
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class GroqGraphChat:
    """
    Stateful chat session backed by Groq API + GraphRAG context injection.

    Keeps the full multi-turn history in memory; each user turn is sent with
    the retrieved graph context wrapped in <context>/<question> tags.
    """

    def __init__(self, api_key: str, model: str = _DEFAULT_MODEL):
        self._client = Groq(api_key=api_key)
        self._model = model
        self._history: List[Dict[str, str]] = []  # alternating user/assistant turns

    def reset(self) -> None:
        """Clear the conversation history (start a fresh session)."""
        self._history = []

    def chat(
        self,
        user_query: str,
        retrieved_nodes: List[Dict[str, Any]],
        stream: bool = True,
    ) -> str | Generator:
        """
        Send a message with GraphRAG context and get a response.

        Args:
            user_query: The user's question
            retrieved_nodes: Chunks from GraphBuilder.retrieve()
            stream: If True, returns a generator for streaming UI

        Returns:
            Full response string (if stream=False) or generator (if stream=True)
        """
        context = _build_context(retrieved_nodes)

        # Build the user turn with injected context
        augmented_user_message = f"""<context>
{context}
</context>

<question>
{user_query}
</question>"""

        # Append to history
        self._history.append({"role": "user", "content": augmented_user_message})

        # NOTE(review): history (with injected context per turn) grows without
        # bound; long sessions may exceed the model's context window — consider
        # trimming old turns.
        messages = [{"role": "system", "content": _SYSTEM_PROMPT}] + self._history

        if stream:
            return self._stream_response(messages)
        else:
            return self._full_response(messages)

    def _full_response(self, messages: List[Dict]) -> str:
        """Blocking completion: records the answer in history and returns it."""
        response = self._client.chat.completions.create(
            model=self._model,
            messages=messages,
            max_tokens=1024,
            temperature=0.3,
        )
        answer = response.choices[0].message.content
        self._history.append({"role": "assistant", "content": answer})
        return answer

    def _stream_response(self, messages: List[Dict]) -> Generator:
        """Streaming completion: yields text deltas, then records the full
        assembled answer in history once the stream is exhausted."""
        stream = self._client.chat.completions.create(
            model=self._model,
            messages=messages,
            max_tokens=1024,
            temperature=0.3,
            stream=True,
        )
        full_response = ""
        for chunk in stream:
            delta = chunk.choices[0].delta.content or ""
            full_response += delta
            yield delta
        self._history.append({"role": "assistant", "content": full_response})
|
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
RAG Pipeline β wires everything together.
|
| 3 |
+
Used by the Streamlit chat tab.
|
| 4 |
+
"""
|
| 5 |
+
from typing import List, Dict, Any, Optional
|
| 6 |
+
from dataclasses import dataclass, field
|
| 7 |
+
|
| 8 |
+
from .chunker import Chunk, chunk_document
|
| 9 |
+
from .vector_store import VectorStore
|
| 10 |
+
from .graph_builder import GraphBuilder
|
| 11 |
+
from .groq_chat import GroqGraphChat
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass
class PipelineState:
    """Holds the built RAG state after ingestion."""
    # Chunks produced for each of the two ingested documents.
    doc1_chunks: List[Chunk] = field(default_factory=list)
    doc2_chunks: List[Chunk] = field(default_factory=list)
    # Populated by GraphRAGPipeline.ingest(); None before ingestion.
    vector_store: Optional[VectorStore] = None
    graph_builder: Optional[GraphBuilder] = None
    # Presumably flipped to True once ingestion completes — confirm in
    # GraphRAGPipeline.ingest() (not fully visible here).
    is_ready: bool = False
    # Summary counts: chunk counts, vector count, graph stats.
    stats: Dict[str, Any] = field(default_factory=dict)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class GraphRAGPipeline:
    """
    End-to-end Graph RAG pipeline.

    Usage:
        pipeline = GraphRAGPipeline(groq_api_key="...")
        state = pipeline.ingest(raw_doc1, raw_doc2)
        answer = pipeline.query("What does doc1 say about climate?", state)
    """

    def __init__(
        self,
        groq_api_key: str,
        chunk_size: int = 300,
        chunk_overlap: int = 50,
        top_k_vector: int = 5,
        graph_hops: int = 2,
        graph_max_nodes: int = 10,
    ):
        """
        Args:
            groq_api_key: API key for the Groq chat backend.
            chunk_size: Target chunk size (words) passed to the chunker.
            chunk_overlap: Word overlap between consecutive chunks.
            top_k_vector: Number of seed chunks from vector search.
            graph_hops: How many hops to expand from the seed nodes.
            graph_max_nodes: Cap on nodes returned by graph retrieval.
        """
        self.groq_api_key = groq_api_key
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.top_k_vector = top_k_vector
        self.graph_hops = graph_hops
        self.graph_max_nodes = graph_max_nodes

        # Chat session; created fresh on each ingest(), or lazily in
        # query() if this pipeline instance never ran ingest() itself.
        self._chat: Optional[GroqGraphChat] = None

    # ------------------------------------------------------------------
    # Ingestion
    # ------------------------------------------------------------------

    def ingest(self, raw_doc1, raw_doc2) -> PipelineState:
        """
        Process both documents: chunk -> embed -> store -> build graph.

        Returns a PipelineState that should be stored in st.session_state.
        """
        state = PipelineState()

        # 1. Chunk both documents.
        state.doc1_chunks = chunk_document(
            raw_doc1, "doc1", self.chunk_size, self.chunk_overlap
        )
        state.doc2_chunks = chunk_document(
            raw_doc2, "doc2", self.chunk_size, self.chunk_overlap
        )

        # 2. Vector store (one shared collection for both documents).
        state.vector_store = VectorStore()
        state.vector_store.add_chunks(state.doc1_chunks)
        state.vector_store.add_chunks(state.doc2_chunks)

        # 3. Knowledge graph over all chunks.
        state.graph_builder = GraphBuilder()
        state.graph_builder.build(state.doc1_chunks, state.doc2_chunks)

        # 4. Stats for display in the UI.
        graph_stats = state.graph_builder.get_stats()
        state.stats = {
            "doc1_chunks": len(state.doc1_chunks),
            "doc2_chunks": len(state.doc2_chunks),
            "total_vectors": state.vector_store.count(),
            **graph_stats,
        }
        state.is_ready = True

        # 5. Fresh chat session (discards any previous conversation).
        self._chat = GroqGraphChat(api_key=self.groq_api_key)

        return state

    # ------------------------------------------------------------------
    # Query
    # ------------------------------------------------------------------

    def query(
        self,
        user_query: str,
        state: PipelineState,
        stream: bool = True,
    ):
        """
        Retrieve relevant context via vector + graph search,
        then pass to Groq for generation.

        Args:
            user_query: The user's question.
            state: A PipelineState previously returned by ingest().
            stream: Forwarded to the Groq chat client.

        Raises:
            RuntimeError: If the given state has not been built yet.
        """
        if not state.is_ready:
            raise RuntimeError("Pipeline not ready. Call ingest() first.")

        # Bug fix: a pipeline instance recreated across Streamlit reruns can
        # be handed an already-built state while self._chat is still None,
        # which previously crashed with AttributeError at step 3. Create the
        # chat session lazily instead.
        if self._chat is None:
            self._chat = GroqGraphChat(api_key=self.groq_api_key)

        # Step 1: Vector search (both docs).
        seed_chunks = state.vector_store.search(
            user_query, n_results=self.top_k_vector
        )

        # Step 2: Graph expansion around the vector hits.
        retrieved_nodes = state.graph_builder.retrieve(
            query=user_query,
            seed_chunks=seed_chunks,
            hops=self.graph_hops,
            max_nodes=self.graph_max_nodes,
        )

        # Fallback: if graph expansion returned nothing, use raw vector results.
        if not retrieved_nodes:
            retrieved_nodes = [
                {
                    "chunk_id": f"{s['doc_id']}_chunk_{s['chunk_index']}",
                    "text": s["text"],
                    "doc_id": s["doc_id"],
                    "section": s.get("section", ""),
                    "relevance": s["score"],
                }
                for s in seed_chunks
            ]

        # Step 3: Generate the answer via Groq.
        return self._chat.chat(
            user_query=user_query,
            retrieved_nodes=retrieved_nodes,
            stream=stream,
        )

    def reset_chat(self) -> None:
        """Clear conversation history (keep the indexed data)."""
        if self._chat:
            self._chat.reset()
|
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Vector Store using ChromaDB (in-memory, HF Spaces compatible)
|
| 3 |
+
Stores and retrieves chunks from both documents via semantic search.
|
| 4 |
+
"""
|
| 5 |
+
import chromadb
|
| 6 |
+
from chromadb.config import Settings
|
| 7 |
+
from sentence_transformers import SentenceTransformer
|
| 8 |
+
from typing import List, Dict, Any, Optional
|
| 9 |
+
import hashlib
|
| 10 |
+
|
| 11 |
+
from .chunker import Chunk
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
_EMBED_MODEL_NAME = "all-MiniLM-L6-v2" # fast, small, works great
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class VectorStore:
    """
    Wraps ChromaDB with a SentenceTransformer embedding function.
    Collection name: 'doc_chunks' -- shared for both documents.
    """

    # Single shared collection for chunks of both documents.
    _COLLECTION_NAME = "doc_chunks"

    def __init__(self, persist_dir: Optional[str] = None):
        """
        Args:
            persist_dir: Directory for a persistent ChromaDB client. When
                None (default), an in-memory ephemeral client is used
                (HF Spaces compatible).
        """
        self._model = SentenceTransformer(_EMBED_MODEL_NAME)

        if persist_dir:
            self._client = chromadb.PersistentClient(path=persist_dir)
        else:
            self._client = chromadb.EphemeralClient()

        self._collection = self._create_collection()

    def _create_collection(self):
        """Get or create the shared chunk collection (cosine space)."""
        return self._client.get_or_create_collection(
            name=self._COLLECTION_NAME,
            metadata={"hnsw:space": "cosine"},
        )

    # ------------------------------------------------------------------
    # Write
    # ------------------------------------------------------------------

    def add_chunks(self, chunks: List[Chunk]) -> None:
        """Embed and upsert chunks into the collection.

        No-op for an empty list. Chunk.metadata values are stringified
        because Chroma metadata only accepts primitive types.
        """
        if not chunks:
            return

        texts = [c.text for c in chunks]
        embeddings = self._model.encode(
            texts, batch_size=32, show_progress_bar=False
        ).tolist()

        ids = [c.chunk_id for c in chunks]
        metadatas = [
            {
                "doc_id": c.doc_id,
                "chunk_index": c.chunk_index,
                "section": c.section,
                "page": c.page,
                **{k: str(v) for k, v in c.metadata.items()},
            }
            for c in chunks
        ]

        # Upsert (not add) so re-ingesting the same chunk_ids overwrites
        # rather than erroring on duplicates.
        self._collection.upsert(
            ids=ids,
            embeddings=embeddings,
            documents=texts,
            metadatas=metadatas,
        )

    def clear(self) -> None:
        """Remove all chunks (useful for re-ingestion)."""
        self._client.delete_collection(self._COLLECTION_NAME)
        self._collection = self._create_collection()

    # ------------------------------------------------------------------
    # Read
    # ------------------------------------------------------------------

    def search(
        self,
        query: str,
        n_results: int = 5,
        doc_filter: Optional[str] = None,  # "doc1" | "doc2" | None
    ) -> List[Dict[str, Any]]:
        """
        Semantic search over stored chunks.

        Args:
            query: Natural-language query to embed and match.
            n_results: Maximum number of hits (capped by collection size).
            doc_filter: Restrict results to one document id, or None for both.

        Returns:
            List of dicts with keys: text, doc_id, section, chunk_index,
            score (cosine similarity, higher is better). Empty list when
            nothing has been indexed yet.
        """
        total = self._collection.count()
        # Robustness fix: previously an empty collection still embedded the
        # query and asked Chroma for `min(n_results, 0 or 1) = 1` result it
        # could not return.
        if total == 0:
            return []

        query_embedding = self._model.encode([query]).tolist()

        where = {"doc_id": doc_filter} if doc_filter else None

        results = self._collection.query(
            query_embeddings=query_embedding,
            n_results=min(n_results, total),
            where=where,
            include=["documents", "metadatas", "distances"],
        )

        hits = []
        for text, meta, dist in zip(
            results["documents"][0],
            results["metadatas"][0],
            results["distances"][0],
        ):
            hits.append({
                "text": text,
                "doc_id": meta.get("doc_id"),
                "section": meta.get("section", ""),
                "chunk_index": meta.get("chunk_index", -1),
                "score": round(1 - dist, 4),  # cosine distance -> similarity
            })

        return hits

    def count(self) -> int:
        """Total number of stored chunks across both documents."""
        return self._collection.count()

    def get_all_chunks_for_doc(self, doc_id: str) -> List[Dict[str, Any]]:
        """Retrieve all stored chunks for a given document, in chunk order."""
        results = self._collection.get(
            where={"doc_id": doc_id},
            include=["documents", "metadatas"],
        )
        items = []
        for text, meta in zip(results["documents"], results["metadatas"]):
            items.append({"text": text, **meta})
        # Sort by chunk_index so callers see document order.
        items.sort(key=lambda x: int(x.get("chunk_index", 0)))
        return items
|
|
@@ -1,10 +1,11 @@
|
|
| 1 |
"""
|
| 2 |
Multi-Agent Document Comparison Streamlit App
|
|
|
|
| 3 |
"""
|
| 4 |
import sys
|
|
|
|
| 5 |
from pathlib import Path
|
| 6 |
|
| 7 |
-
# Add project root to Python path for imports
|
| 8 |
project_root = Path(__file__).parent
|
| 9 |
if str(project_root) not in sys.path:
|
| 10 |
sys.path.insert(0, str(project_root))
|
|
@@ -13,7 +14,6 @@ import streamlit as st
|
|
| 13 |
import asyncio
|
| 14 |
import json
|
| 15 |
|
| 16 |
-
# Import agents and utilities
|
| 17 |
from agents.ingestion_agent import IngestionAgent
|
| 18 |
from agents.text_agent import TextAgent
|
| 19 |
from agents.table_agent import TableAgent
|
|
@@ -28,7 +28,10 @@ from utils.visualization import (
|
|
| 28 |
from models.document import ProcessedDocument
|
| 29 |
import config
|
| 30 |
|
| 31 |
-
#
|
|
|
|
|
|
|
|
|
|
| 32 |
try:
|
| 33 |
from agents.image_agent import ImageAgent
|
| 34 |
IMAGE_AGENT_AVAILABLE = True
|
|
@@ -48,7 +51,6 @@ except ImportError:
|
|
| 48 |
META_AGENT_AVAILABLE = False
|
| 49 |
|
| 50 |
|
| 51 |
-
# Page configuration
|
| 52 |
st.set_page_config(
|
| 53 |
page_title="Multi-Agent Document Comparator",
|
| 54 |
page_icon="π",
|
|
@@ -58,448 +60,379 @@ st.set_page_config(
|
|
| 58 |
|
| 59 |
|
| 60 |
def main():
|
| 61 |
-
"
|
| 62 |
-
|
| 63 |
-
# Header
|
| 64 |
-
st.title("π Multi-Agent Document Comparator")
|
| 65 |
-
st.markdown("**An agentic system to accurately match document similarity**")
|
| 66 |
|
| 67 |
-
# Show architecture diagram
|
| 68 |
with st.expander("ποΈ View System Architecture", expanded=False):
|
| 69 |
arch_path = Path("src/img/multi_agent_doc_similarity_architecture.svg")
|
| 70 |
if arch_path.exists():
|
| 71 |
st.image(str(arch_path), use_container_width=True)
|
| 72 |
-
else:
|
| 73 |
-
st.info("Architecture diagram not found")
|
| 74 |
|
| 75 |
st.markdown("---")
|
| 76 |
|
| 77 |
-
# Sidebar
|
| 78 |
with st.sidebar:
|
| 79 |
st.header("βοΈ Configuration")
|
| 80 |
|
| 81 |
-
# Phase 2 feature toggles
|
| 82 |
st.subheader("Phase 2 Features")
|
| 83 |
enable_phase2 = st.checkbox(
|
| 84 |
"Enable Phase 2 Modalities",
|
| 85 |
value=config.ENABLE_IMAGE_COMPARISON,
|
| 86 |
help="Enable image, layout, and metadata comparison"
|
| 87 |
)
|
| 88 |
-
|
| 89 |
-
# Modality weights
|
| 90 |
st.markdown("---")
|
| 91 |
st.subheader("Modality Weights")
|
| 92 |
|
| 93 |
if enable_phase2:
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
table_weight = st.slider(
|
| 103 |
-
"Table Weight",
|
| 104 |
-
min_value=0.0,
|
| 105 |
-
max_value=1.0,
|
| 106 |
-
value=config.MODALITY_WEIGHTS["table"],
|
| 107 |
-
step=0.05
|
| 108 |
-
)
|
| 109 |
-
image_weight = st.slider(
|
| 110 |
-
"Image Weight",
|
| 111 |
-
min_value=0.0,
|
| 112 |
-
max_value=1.0,
|
| 113 |
-
value=config.MODALITY_WEIGHTS["image"],
|
| 114 |
-
step=0.05
|
| 115 |
-
)
|
| 116 |
-
layout_weight = st.slider(
|
| 117 |
-
"Layout Weight",
|
| 118 |
-
min_value=0.0,
|
| 119 |
-
max_value=1.0,
|
| 120 |
-
value=config.MODALITY_WEIGHTS["layout"],
|
| 121 |
-
step=0.05
|
| 122 |
-
)
|
| 123 |
-
metadata_weight = st.slider(
|
| 124 |
-
"Metadata Weight",
|
| 125 |
-
min_value=0.0,
|
| 126 |
-
max_value=1.0,
|
| 127 |
-
value=config.MODALITY_WEIGHTS["metadata"],
|
| 128 |
-
step=0.05
|
| 129 |
-
)
|
| 130 |
-
|
| 131 |
-
# Normalize weights to sum to 1.0
|
| 132 |
-
total_weight = text_weight + table_weight + image_weight + layout_weight + metadata_weight
|
| 133 |
-
if total_weight > 0:
|
| 134 |
weights = {
|
| 135 |
-
"text":
|
| 136 |
-
"table":
|
| 137 |
-
"image":
|
| 138 |
-
"layout":
|
| 139 |
-
"metadata":
|
| 140 |
}
|
| 141 |
else:
|
| 142 |
weights = config.MODALITY_WEIGHTS
|
| 143 |
-
|
| 144 |
-
st.info(f"Weights normalized to sum to 1.0")
|
| 145 |
-
|
| 146 |
else:
|
| 147 |
-
|
| 148 |
-
text_weight = st.slider(
|
| 149 |
-
"Text Weight",
|
| 150 |
-
min_value=0.0,
|
| 151 |
-
max_value=1.0,
|
| 152 |
-
value=config.MODALITY_WEIGHTS_PHASE1["text"],
|
| 153 |
-
step=0.05
|
| 154 |
-
)
|
| 155 |
table_weight = 1.0 - text_weight
|
| 156 |
st.write(f"Table Weight: {table_weight:.2f}")
|
| 157 |
-
|
| 158 |
weights = {"text": text_weight, "table": table_weight}
|
| 159 |
|
| 160 |
-
# Phase status
|
| 161 |
st.markdown("---")
|
| 162 |
-
st.subheader("π
|
| 163 |
st.write("β
Text comparison")
|
| 164 |
st.write("β
Table comparison")
|
| 165 |
-
|
| 166 |
if enable_phase2:
|
| 167 |
-
st.write(f"{'β
' if IMAGE_AGENT_AVAILABLE
|
| 168 |
st.write(f"{'β
' if LAYOUT_AGENT_AVAILABLE else 'β οΈ'} Layout comparison")
|
| 169 |
-
st.write(f"{'β
' if META_AGENT_AVAILABLE
|
| 170 |
else:
|
| 171 |
-
st.write("βΈοΈ Image
|
| 172 |
-
|
| 173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
-
|
| 176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
|
| 178 |
-
|
| 179 |
-
st.subheader("π€ Document 1 (Main)")
|
| 180 |
-
uploaded_file1 = st.file_uploader(
|
| 181 |
-
"Upload PDF or DOCX",
|
| 182 |
-
type=["pdf", "docx"],
|
| 183 |
-
key="file1",
|
| 184 |
-
help="Maximum file size: 50MB"
|
| 185 |
-
)
|
| 186 |
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
)
|
| 195 |
|
| 196 |
-
|
| 197 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
return
|
| 203 |
-
|
| 204 |
-
# Process documents and compare
|
| 205 |
-
with st.spinner("Processing documents..."):
|
| 206 |
-
try:
|
| 207 |
-
# Save uploaded files
|
| 208 |
-
file1_path = save_uploaded_file(uploaded_file1)
|
| 209 |
-
file2_path = save_uploaded_file(uploaded_file2)
|
| 210 |
-
|
| 211 |
-
# Validate files
|
| 212 |
-
valid1, error1 = validate_file(file1_path)
|
| 213 |
-
valid2, error2 = validate_file(file2_path)
|
| 214 |
-
|
| 215 |
-
if not valid1:
|
| 216 |
-
st.error(f"Document 1 error: {error1}")
|
| 217 |
-
return
|
| 218 |
-
if not valid2:
|
| 219 |
-
st.error(f"Document 2 error: {error2}")
|
| 220 |
-
return
|
| 221 |
-
|
| 222 |
-
# Process documents
|
| 223 |
-
report = asyncio.run(process_and_compare(
|
| 224 |
-
file1_path,
|
| 225 |
-
file2_path,
|
| 226 |
-
weights,
|
| 227 |
-
enable_phase2
|
| 228 |
-
))
|
| 229 |
-
|
| 230 |
-
# Display results
|
| 231 |
-
display_results(report)
|
| 232 |
-
|
| 233 |
-
except Exception as e:
|
| 234 |
-
st.error(f"An error occurred: {str(e)}")
|
| 235 |
-
import traceback
|
| 236 |
-
st.code(traceback.format_exc())
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
async def process_and_compare(file1_path: str, file2_path: str, weights: dict, enable_phase2: bool = False):
|
| 240 |
-
"""
|
| 241 |
-
Process two documents and compare them.
|
| 242 |
-
|
| 243 |
-
Args:
|
| 244 |
-
file1_path: Path to first document
|
| 245 |
-
file2_path: Path to second document
|
| 246 |
-
weights: Modality weights
|
| 247 |
-
enable_phase2: Enable Phase 2 modalities (image, layout, metadata)
|
| 248 |
-
|
| 249 |
-
Returns:
|
| 250 |
-
SimilarityReport
|
| 251 |
-
"""
|
| 252 |
-
# Initialize agents
|
| 253 |
ingestion_agent = IngestionAgent()
|
| 254 |
-
text_agent
|
| 255 |
-
table_agent
|
| 256 |
-
orchestrator
|
| 257 |
|
| 258 |
-
|
| 259 |
-
image_agent = ImageAgent() if enable_phase2 and IMAGE_AGENT_AVAILABLE else None
|
| 260 |
layout_agent = LayoutAgent() if enable_phase2 and LAYOUT_AGENT_AVAILABLE else None
|
| 261 |
-
meta_agent
|
| 262 |
|
| 263 |
-
# Progress tracking
|
| 264 |
progress_bar = st.progress(0)
|
| 265 |
-
status_text
|
| 266 |
|
| 267 |
-
# Step 1: Ingest documents
|
| 268 |
status_text.text("β³ Ingesting documents...")
|
| 269 |
progress_bar.progress(10)
|
| 270 |
-
|
| 271 |
raw_doc1 = await ingestion_agent.process(file1_path)
|
| 272 |
raw_doc2 = await ingestion_agent.process(file2_path)
|
| 273 |
-
|
| 274 |
progress_bar.progress(15)
|
| 275 |
|
| 276 |
-
|
| 277 |
-
status_text.text("β³ Extracting and embedding text...")
|
| 278 |
-
|
| 279 |
text_chunks1, text_embeddings1 = await text_agent.process(raw_doc1)
|
| 280 |
text_chunks2, text_embeddings2 = await text_agent.process(raw_doc2)
|
| 281 |
-
|
| 282 |
progress_bar.progress(30)
|
| 283 |
|
| 284 |
-
|
| 285 |
-
status_text.text("β³ Extracting and embedding tables...")
|
| 286 |
-
|
| 287 |
tables1, table_embeddings1 = await table_agent.process(raw_doc1)
|
| 288 |
tables2, table_embeddings2 = await table_agent.process(raw_doc2)
|
| 289 |
-
|
| 290 |
progress_bar.progress(45)
|
| 291 |
|
| 292 |
-
|
| 293 |
-
images1, image_embeddings1 = [], None
|
| 294 |
-
images2, image_embeddings2 = [], None
|
| 295 |
if image_agent:
|
| 296 |
-
status_text.text("β³ Extracting
|
| 297 |
try:
|
| 298 |
images1, image_embeddings1 = await image_agent.process(raw_doc1)
|
| 299 |
images2, image_embeddings2 = await image_agent.process(raw_doc2)
|
| 300 |
except Exception as e:
|
| 301 |
st.warning(f"Image extraction failed: {e}")
|
| 302 |
-
|
| 303 |
progress_bar.progress(60)
|
| 304 |
|
| 305 |
-
|
| 306 |
-
layout1, layout2 = None, None
|
| 307 |
if layout_agent:
|
| 308 |
-
status_text.text("β³
|
| 309 |
try:
|
| 310 |
layout1 = await layout_agent.process(raw_doc1)
|
| 311 |
layout2 = await layout_agent.process(raw_doc2)
|
| 312 |
except Exception as e:
|
| 313 |
st.warning(f"Layout analysis failed: {e}")
|
| 314 |
-
|
| 315 |
progress_bar.progress(70)
|
| 316 |
|
| 317 |
-
|
| 318 |
-
metadata1, metadata2 = None, None
|
| 319 |
if meta_agent:
|
| 320 |
-
status_text.text("β³ Extracting metadata
|
| 321 |
try:
|
| 322 |
metadata1 = await meta_agent.process(raw_doc1)
|
| 323 |
metadata2 = await meta_agent.process(raw_doc2)
|
| 324 |
except Exception as e:
|
| 325 |
st.warning(f"Metadata extraction failed: {e}")
|
| 326 |
-
|
| 327 |
progress_bar.progress(80)
|
| 328 |
|
| 329 |
-
# Create processed documents
|
| 330 |
processed_doc1 = ProcessedDocument(
|
| 331 |
-
filename=raw_doc1.filename,
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
total_pages=raw_doc1.total_pages,
|
| 335 |
-
file_type=raw_doc1.file_type,
|
| 336 |
-
images=images1,
|
| 337 |
-
layout=layout1,
|
| 338 |
-
metadata=metadata1
|
| 339 |
)
|
| 340 |
-
|
| 341 |
processed_doc2 = ProcessedDocument(
|
| 342 |
-
filename=raw_doc2.filename,
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
total_pages=raw_doc2.total_pages,
|
| 346 |
-
file_type=raw_doc2.file_type,
|
| 347 |
-
images=images2,
|
| 348 |
-
layout=layout2,
|
| 349 |
-
metadata=metadata2
|
| 350 |
)
|
| 351 |
|
| 352 |
-
|
| 353 |
-
status_text.text("β³ Comparing documents...")
|
| 354 |
-
|
| 355 |
report = await orchestrator.compare_documents(
|
| 356 |
-
processed_doc1,
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
text_embeddings2,
|
| 361 |
-
table_embeddings2,
|
| 362 |
-
# Phase 2 parameters
|
| 363 |
-
image_embeddings1,
|
| 364 |
-
image_embeddings2,
|
| 365 |
-
layout1,
|
| 366 |
-
layout2,
|
| 367 |
-
metadata1,
|
| 368 |
-
metadata2
|
| 369 |
)
|
| 370 |
|
| 371 |
progress_bar.progress(100)
|
| 372 |
status_text.text("β
Comparison complete!")
|
| 373 |
|
| 374 |
-
|
|
|
|
| 375 |
|
| 376 |
|
| 377 |
def display_results(report):
|
| 378 |
-
"""
|
| 379 |
-
Display comparison results.
|
| 380 |
-
|
| 381 |
-
Args:
|
| 382 |
-
report: SimilarityReport object
|
| 383 |
-
"""
|
| 384 |
st.markdown("---")
|
| 385 |
st.header("π Comparison Results")
|
| 386 |
|
| 387 |
-
# Overall similarity gauge
|
| 388 |
col1, col2 = st.columns([1, 1])
|
| 389 |
-
|
| 390 |
with col1:
|
| 391 |
gauge_fig = create_similarity_gauge(report.overall_score)
|
| 392 |
st.plotly_chart(gauge_fig, use_container_width=True)
|
| 393 |
-
|
| 394 |
with col2:
|
| 395 |
st.markdown(create_score_legend())
|
| 396 |
|
| 397 |
-
# Modality breakdown
|
| 398 |
st.markdown("---")
|
| 399 |
st.subheader("π Per-Modality Breakdown")
|
| 400 |
-
|
| 401 |
breakdown_fig = create_modality_breakdown_chart(report)
|
| 402 |
st.plotly_chart(breakdown_fig, use_container_width=True)
|
| 403 |
|
| 404 |
-
# Detailed scores
|
| 405 |
cols = st.columns(5)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
|
| 407 |
-
with cols[0]:
|
| 408 |
-
if report.text_score:
|
| 409 |
-
st.metric(
|
| 410 |
-
"Text Similarity",
|
| 411 |
-
f"{report.text_score.score:.1%}",
|
| 412 |
-
f"{report.text_score.details.get('num_matches', 0)} matches"
|
| 413 |
-
)
|
| 414 |
-
|
| 415 |
-
with cols[1]:
|
| 416 |
-
if report.table_score:
|
| 417 |
-
st.metric(
|
| 418 |
-
"Table Similarity",
|
| 419 |
-
f"{report.table_score.score:.1%}",
|
| 420 |
-
f"{report.table_score.details.get('num_matches', 0)} matches"
|
| 421 |
-
)
|
| 422 |
-
|
| 423 |
-
with cols[2]:
|
| 424 |
-
if report.image_score:
|
| 425 |
-
st.metric(
|
| 426 |
-
"Image Similarity",
|
| 427 |
-
f"{report.image_score.score:.1%}",
|
| 428 |
-
f"{report.image_score.details.get('num_matches', 0)} matches"
|
| 429 |
-
)
|
| 430 |
-
|
| 431 |
-
with cols[3]:
|
| 432 |
-
if report.layout_score:
|
| 433 |
-
st.metric(
|
| 434 |
-
"Layout Similarity",
|
| 435 |
-
f"{report.layout_score.score:.1%}",
|
| 436 |
-
f"{report.layout_score.details.get('num_metrics', 0)} metrics"
|
| 437 |
-
)
|
| 438 |
-
|
| 439 |
-
with cols[4]:
|
| 440 |
-
if report.metadata_score:
|
| 441 |
-
st.metric(
|
| 442 |
-
"Metadata Similarity",
|
| 443 |
-
f"{report.metadata_score.score:.1%}",
|
| 444 |
-
f"{report.metadata_score.details.get('num_fields_compared', 0)} fields"
|
| 445 |
-
)
|
| 446 |
-
|
| 447 |
-
# Matched sections
|
| 448 |
st.markdown("---")
|
| 449 |
st.subheader("π Top Matched Sections")
|
| 450 |
-
|
| 451 |
if report.matched_sections:
|
| 452 |
-
|
| 453 |
-
st.markdown(formatted_sections)
|
| 454 |
else:
|
| 455 |
-
st.info("No significant matches found
|
| 456 |
|
| 457 |
-
# Phase 2: Additional modality details
|
| 458 |
if report.image_score or report.layout_score or report.metadata_score:
|
| 459 |
st.markdown("---")
|
| 460 |
st.subheader("π¨ Phase 2 Modality Details")
|
| 461 |
-
|
| 462 |
-
# Image matches
|
| 463 |
if report.image_score and report.image_score.matched_items:
|
| 464 |
-
with st.expander(f"πΌοΈ Image Matches ({len(report.image_score.matched_items)}
|
| 465 |
-
for idx,
|
| 466 |
-
st.markdown(f"**Match {idx}**
|
| 467 |
-
st.write(f"Doc1: Page {match['doc1_page']}, Size: {match['doc1_size']}")
|
| 468 |
-
st.write(f"Doc2: Page {match['doc2_page']}, Size: {match['doc2_size']}")
|
| 469 |
-
st.markdown("---")
|
| 470 |
-
|
| 471 |
-
# Layout details
|
| 472 |
if report.layout_score:
|
| 473 |
-
with st.expander(f"π Layout
|
| 474 |
-
for
|
| 475 |
-
if
|
| 476 |
-
st.metric(
|
| 477 |
-
|
| 478 |
-
# Metadata matches
|
| 479 |
if report.metadata_score and report.metadata_score.matched_items:
|
| 480 |
-
with st.expander(f"π Metadata
|
| 481 |
-
for
|
| 482 |
-
st.markdown(f"**{
|
| 483 |
-
|
| 484 |
-
with col1:
|
| 485 |
-
st.write(f"Doc1: {match['doc1_value']}")
|
| 486 |
-
with col2:
|
| 487 |
-
st.write(f"Doc2: {match['doc2_value']}")
|
| 488 |
-
st.markdown("---")
|
| 489 |
-
|
| 490 |
-
# Download report
|
| 491 |
st.markdown("---")
|
| 492 |
report_json = json.dumps(report.model_dump(), indent=2, default=str)
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
label="π₯ Download Report (JSON)",
|
| 499 |
-
data=report_json,
|
| 500 |
-
file_name=f"similarity_report_{report.timestamp.strftime('%Y%m%d_%H%M%S')}.json",
|
| 501 |
-
mime="application/json"
|
| 502 |
-
)
|
| 503 |
|
| 504 |
|
| 505 |
if __name__ == "__main__":
|
|
|
|
| 1 |
"""
|
| 2 |
Multi-Agent Document Comparison Streamlit App
|
| 3 |
+
+ Graph RAG Chat Tab (new)
|
| 4 |
"""
|
| 5 |
import sys
|
| 6 |
+
import os
|
| 7 |
from pathlib import Path
|
| 8 |
|
|
|
|
| 9 |
project_root = Path(__file__).parent
|
| 10 |
if str(project_root) not in sys.path:
|
| 11 |
sys.path.insert(0, str(project_root))
|
|
|
|
| 14 |
import asyncio
|
| 15 |
import json
|
| 16 |
|
|
|
|
| 17 |
from agents.ingestion_agent import IngestionAgent
|
| 18 |
from agents.text_agent import TextAgent
|
| 19 |
from agents.table_agent import TableAgent
|
|
|
|
| 28 |
from models.document import ProcessedDocument
|
| 29 |
import config
|
| 30 |
|
| 31 |
+
# Graph RAG imports
|
| 32 |
+
from rag.rag_pipeline import GraphRAGPipeline, PipelineState
|
| 33 |
+
|
| 34 |
+
# Phase 2 imports (conditional)
|
| 35 |
try:
|
| 36 |
from agents.image_agent import ImageAgent
|
| 37 |
IMAGE_AGENT_AVAILABLE = True
|
|
|
|
| 51 |
META_AGENT_AVAILABLE = False
|
| 52 |
|
| 53 |
|
|
|
|
| 54 |
st.set_page_config(
|
| 55 |
page_title="Multi-Agent Document Comparator",
|
| 56 |
page_icon="π",
|
|
|
|
| 60 |
|
| 61 |
|
| 62 |
def main():
|
| 63 |
+
st.title("π Multi-Agent Document Comparator + Graph RAG Chat")
|
| 64 |
+
st.markdown("**Agentic document similarity Β· Knowledge Graph RAG Β· Groq-powered chat**")
|
|
|
|
|
|
|
|
|
|
| 65 |
|
|
|
|
| 66 |
with st.expander("ποΈ View System Architecture", expanded=False):
|
| 67 |
arch_path = Path("src/img/multi_agent_doc_similarity_architecture.svg")
|
| 68 |
if arch_path.exists():
|
| 69 |
st.image(str(arch_path), use_container_width=True)
|
|
|
|
|
|
|
| 70 |
|
| 71 |
st.markdown("---")
|
| 72 |
|
| 73 |
+
# ββ Sidebar βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 74 |
with st.sidebar:
|
| 75 |
st.header("βοΈ Configuration")
|
| 76 |
|
|
|
|
| 77 |
st.subheader("Phase 2 Features")
|
| 78 |
enable_phase2 = st.checkbox(
|
| 79 |
"Enable Phase 2 Modalities",
|
| 80 |
value=config.ENABLE_IMAGE_COMPARISON,
|
| 81 |
help="Enable image, layout, and metadata comparison"
|
| 82 |
)
|
|
|
|
|
|
|
| 83 |
st.markdown("---")
|
| 84 |
st.subheader("Modality Weights")
|
| 85 |
|
| 86 |
if enable_phase2:
|
| 87 |
+
text_weight = st.slider("Text Weight", 0.0, 1.0, config.MODALITY_WEIGHTS["text"], 0.05)
|
| 88 |
+
table_weight = st.slider("Table Weight", 0.0, 1.0, config.MODALITY_WEIGHTS["table"], 0.05)
|
| 89 |
+
image_weight = st.slider("Image Weight", 0.0, 1.0, config.MODALITY_WEIGHTS["image"], 0.05)
|
| 90 |
+
layout_weight = st.slider("Layout Weight", 0.0, 1.0, config.MODALITY_WEIGHTS["layout"], 0.05)
|
| 91 |
+
meta_weight = st.slider("Metadata Weight", 0.0, 1.0, config.MODALITY_WEIGHTS["metadata"], 0.05)
|
| 92 |
+
|
| 93 |
+
total = text_weight + table_weight + image_weight + layout_weight + meta_weight
|
| 94 |
+
if total > 0:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
weights = {
|
| 96 |
+
"text": text_weight / total,
|
| 97 |
+
"table": table_weight / total,
|
| 98 |
+
"image": image_weight / total,
|
| 99 |
+
"layout": layout_weight / total,
|
| 100 |
+
"metadata": meta_weight / total,
|
| 101 |
}
|
| 102 |
else:
|
| 103 |
weights = config.MODALITY_WEIGHTS
|
| 104 |
+
st.info("Weights normalised to 1.0")
|
|
|
|
|
|
|
| 105 |
else:
|
| 106 |
+
text_weight = st.slider("Text Weight", 0.0, 1.0, config.MODALITY_WEIGHTS_PHASE1["text"], 0.05)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
table_weight = 1.0 - text_weight
|
| 108 |
st.write(f"Table Weight: {table_weight:.2f}")
|
|
|
|
| 109 |
weights = {"text": text_weight, "table": table_weight}
|
| 110 |
|
|
|
|
| 111 |
st.markdown("---")
|
| 112 |
+
st.subheader("π Status")
|
| 113 |
st.write("β
Text comparison")
|
| 114 |
st.write("β
Table comparison")
|
|
|
|
| 115 |
if enable_phase2:
|
| 116 |
+
st.write(f"{'β
' if IMAGE_AGENT_AVAILABLE else 'β οΈ'} Image comparison")
|
| 117 |
st.write(f"{'β
' if LAYOUT_AGENT_AVAILABLE else 'β οΈ'} Layout comparison")
|
| 118 |
+
st.write(f"{'β
' if META_AGENT_AVAILABLE else 'β οΈ'} Metadata comparison")
|
| 119 |
else:
|
| 120 |
+
st.write("βΈοΈ Image / Layout / Metadata (disabled)")
|
| 121 |
+
|
| 122 |
+
st.markdown("---")
|
| 123 |
+
st.subheader("π Graph RAG Settings")
|
| 124 |
+
chunk_size = st.slider("Chunk size (words)", 100, 600, 300, 50)
|
| 125 |
+
chunk_overlap = st.slider("Overlap (words)", 20, 150, 50, 10)
|
| 126 |
+
top_k = st.slider("Vector top-k", 3, 15, 5, 1)
|
| 127 |
+
graph_hops = st.slider("Graph hops", 1, 4, 2, 1)
|
| 128 |
+
|
| 129 |
+
# ββ Main tabs βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 130 |
+
tab1, tab2 = st.tabs(["π Document Comparison", "π¬ Graph RAG Chat"])
|
| 131 |
+
|
| 132 |
+
# ββ Session state init ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 133 |
+
for key in ["raw_doc1", "raw_doc2", "rag_state", "rag_pipeline", "chat_history"]:
|
| 134 |
+
if key not in st.session_state:
|
| 135 |
+
st.session_state[key] = None if key != "chat_history" else []
|
| 136 |
+
|
| 137 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 138 |
+
# TAB 1 β Comparison
|
| 139 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 140 |
+
with tab1:
|
| 141 |
+
col1, col2 = st.columns(2)
|
| 142 |
+
|
| 143 |
+
with col1:
|
| 144 |
+
st.subheader("π€ Document 1 (Main)")
|
| 145 |
+
uploaded_file1 = st.file_uploader(
|
| 146 |
+
"Upload PDF or DOCX", type=["pdf", "docx"], key="file1",
|
| 147 |
+
help="Maximum file size: 50MB"
|
| 148 |
+
)
|
| 149 |
|
| 150 |
+
with col2:
|
| 151 |
+
st.subheader("π€ Document 2 (Comparison)")
|
| 152 |
+
uploaded_file2 = st.file_uploader(
|
| 153 |
+
"Upload PDF or DOCX", type=["pdf", "docx"], key="file2",
|
| 154 |
+
help="Maximum file size: 50MB"
|
| 155 |
+
)
|
| 156 |
|
| 157 |
+
st.markdown("---")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
|
| 159 |
+
if st.button("🔍 Compare Documents", type="primary", use_container_width=True):
    if not uploaded_file1 or not uploaded_file2:
        st.error("Please upload both documents before comparing.")
    else:
        with st.spinner("Processing documents..."):
            try:
                file1_path = save_uploaded_file(uploaded_file1)
                file2_path = save_uploaded_file(uploaded_file2)

                valid1, error1 = validate_file(file1_path)
                valid2, error2 = validate_file(file2_path)

                # BUG FIX: the original called st.stop() here, inside this
                # try-block. Streamlit's StopException subclasses Exception,
                # so the broad handler below would swallow it and display a
                # spurious traceback. Branching with if/elif/else makes
                # st.stop() unnecessary.
                if not valid1:
                    st.error(f"Document 1 error: {error1}")
                elif not valid2:
                    st.error(f"Document 2 error: {error2}")
                else:
                    report, raw_doc1, raw_doc2 = asyncio.run(
                        process_and_compare(file1_path, file2_path, weights, enable_phase2)
                    )

                    # Store raw docs for the Graph RAG tab and invalidate any
                    # previously-built RAG state (the documents just changed).
                    st.session_state["raw_doc1"] = raw_doc1
                    st.session_state["raw_doc2"] = raw_doc2
                    st.session_state["rag_state"] = None
                    st.session_state["chat_history"] = []

                    display_results(report)

            except Exception as e:
                # Top-level UI boundary: show the error plus full traceback
                # so users can report actionable failures.
                st.error(f"An error occurred: {str(e)}")
                import traceback
                st.code(traceback.format_exc())
+
# ────────────────────────────────────────────────────────────────────────
# TAB 2 — Graph RAG Chat
# ────────────────────────────────────────────────────────────────────────
with tab2:
    st.subheader("💬 Chat with your Documents (Graph RAG + Groq)")

    # Both raw docs must exist (produced by the comparison tab) before an
    # index can be built.
    docs_ready = (
        st.session_state["raw_doc1"] is not None
        and st.session_state["raw_doc2"] is not None
    )

    if not docs_ready:
        st.info("👈 Please upload and compare documents in the **Document Comparison** tab first.")
    else:
        # Load Groq API key from environment (Hugging Face Spaces secrets)
        groq_key = os.environ.get("GROQ_API_KEY", "")

        if not groq_key:
            st.warning("⚠️ GROQ_API_KEY not found in environment. Please set it in Hugging Face Spaces secrets.")

        col_build, col_reset = st.columns([2, 1])

        with col_build:
            build_btn = st.button(
                "🔨 Build Graph RAG Index",
                disabled=not groq_key,  # useless without an API key
                help="Chunks docs → embeds → builds vector DB + knowledge graph",
            )

        with col_reset:
            if st.button("🔄 Reset Chat"):
                st.session_state["chat_history"] = []
                if st.session_state["rag_pipeline"]:
                    st.session_state["rag_pipeline"].reset_chat()
                st.rerun()

        if build_btn:
            with st.spinner("Chunking, embedding, building knowledge graph — this takes ~30s…"):
                try:
                    pipeline = GraphRAGPipeline(
                        groq_api_key=groq_key,
                        chunk_size=chunk_size,
                        chunk_overlap=chunk_overlap,
                        top_k_vector=top_k,
                        graph_hops=graph_hops,
                    )
                    rag_state = pipeline.ingest(
                        st.session_state["raw_doc1"],
                        st.session_state["raw_doc2"],
                    )
                except Exception as e:
                    # ROBUSTNESS: the original had no error handling here, so
                    # any chunking/embedding failure crashed the whole tab.
                    st.error(f"Failed to build Graph RAG index: {e}")
                else:
                    st.session_state["rag_pipeline"] = pipeline
                    st.session_state["rag_state"] = rag_state
                    st.session_state["chat_history"] = []

                    st.success("✅ Graph RAG index ready!")

                    s = rag_state.stats
                    c1, c2, c3, c4 = st.columns(4)
                    c1.metric("Doc 1 Chunks", s.get("doc1_chunks", 0))
                    c2.metric("Doc 2 Chunks", s.get("doc2_chunks", 0))
                    c3.metric("Graph Nodes", s.get("nodes", 0))
                    c4.metric("Graph Edges", s.get("edges", 0))

                    with st.expander("Edge type breakdown"):
                        for etype, cnt in s.get("edge_types", {}).items():
                            st.write(f"**{etype}**: {cnt}")

        # ── Chat UI ──────────────────────────────────────────────────────
        rag_ready = st.session_state["rag_state"] is not None

        if rag_ready:
            # Replay the conversation so far.
            for msg in st.session_state["chat_history"]:
                with st.chat_message(msg["role"]):
                    st.markdown(msg["content"])

            if user_input := st.chat_input("Ask anything about the two documents…"):
                st.session_state["chat_history"].append(
                    {"role": "user", "content": user_input}
                )
                with st.chat_message("user"):
                    st.markdown(user_input)

                with st.chat_message("assistant"):
                    pipeline: GraphRAGPipeline = st.session_state["rag_pipeline"]
                    rag_state_obj: PipelineState = st.session_state["rag_state"]

                    # Stream tokens as they arrive; write_stream returns the
                    # fully-assembled text for history.
                    response_gen = pipeline.query(user_input, rag_state_obj, stream=True)
                    full_response = st.write_stream(response_gen)

                st.session_state["chat_history"].append(
                    {"role": "assistant", "content": full_response}
                )
        else:
            st.info("👆 Click **Build Graph RAG Index** to start chatting. (Ensure GROQ_API_KEY is set in HF Spaces secrets)")
+
# ββ Helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 290 |
+
|
| 291 |
+
async def process_and_compare(file1_path, file2_path, weights, enable_phase2=False):
    """Run the full multi-agent comparison pipeline on two documents.

    Args:
        file1_path: Path to the first (main) document.
        file2_path: Path to the second (comparison) document.
        weights: Per-modality weights forwarded to SimilarityOrchestrator.
        enable_phase2: When True, also run the optional image/layout/metadata
            agents (each only if its import succeeded at module load).

    Returns:
        Tuple of ``(report, raw_doc1, raw_doc2)`` — the similarity report plus
        the raw ingested documents, which the Graph RAG tab reuses for
        chunking/indexing.
    """
    ingestion_agent = IngestionAgent()
    text_agent = TextAgent()
    table_agent = TableAgent()
    orchestrator = SimilarityOrchestrator(weights=weights)

    # Phase-2 agents are optional: instantiated only when both enabled AND
    # importable (the *_AVAILABLE flags come from guarded imports).
    image_agent = ImageAgent() if enable_phase2 and IMAGE_AGENT_AVAILABLE else None
    layout_agent = LayoutAgent() if enable_phase2 and LAYOUT_AGENT_AVAILABLE else None
    meta_agent = MetaAgent() if enable_phase2 and META_AGENT_AVAILABLE else None

    progress_bar = st.progress(0)
    status_text = st.empty()

    status_text.text("⏳ Ingesting documents...")
    progress_bar.progress(10)
    raw_doc1 = await ingestion_agent.process(file1_path)
    raw_doc2 = await ingestion_agent.process(file2_path)
    progress_bar.progress(15)

    status_text.text("⏳ Extracting text…")
    text_chunks1, text_embeddings1 = await text_agent.process(raw_doc1)
    text_chunks2, text_embeddings2 = await text_agent.process(raw_doc2)
    progress_bar.progress(30)

    status_text.text("⏳ Extracting tables…")
    tables1, table_embeddings1 = await table_agent.process(raw_doc1)
    tables2, table_embeddings2 = await table_agent.process(raw_doc2)
    progress_bar.progress(45)

    # BUG FIX: the original wrote `images1 = images2 = ... = []`, binding all
    # four names to ONE shared list object — a latent aliasing hazard. Use
    # independent empty defaults instead.
    images1, image_embeddings1 = [], []
    images2, image_embeddings2 = [], []
    if image_agent:
        status_text.text("⏳ Extracting images…")
        try:
            images1, image_embeddings1 = await image_agent.process(raw_doc1)
            images2, image_embeddings2 = await image_agent.process(raw_doc2)
        except Exception as e:
            # Best-effort modality: warn and fall back to empty results.
            st.warning(f"Image extraction failed: {e}")
    progress_bar.progress(60)

    layout1 = layout2 = None
    if layout_agent:
        status_text.text("⏳ Analysing layout…")
        try:
            layout1 = await layout_agent.process(raw_doc1)
            layout2 = await layout_agent.process(raw_doc2)
        except Exception as e:
            st.warning(f"Layout analysis failed: {e}")
    progress_bar.progress(70)

    metadata1 = metadata2 = None
    if meta_agent:
        status_text.text("⏳ Extracting metadata…")
        try:
            metadata1 = await meta_agent.process(raw_doc1)
            metadata2 = await meta_agent.process(raw_doc2)
        except Exception as e:
            st.warning(f"Metadata extraction failed: {e}")
    progress_bar.progress(80)

    processed_doc1 = ProcessedDocument(
        filename=raw_doc1.filename, text_chunks=text_chunks1, tables=tables1,
        total_pages=raw_doc1.total_pages, file_type=raw_doc1.file_type,
        images=images1, layout=layout1, metadata=metadata1
    )
    processed_doc2 = ProcessedDocument(
        filename=raw_doc2.filename, text_chunks=text_chunks2, tables=tables2,
        total_pages=raw_doc2.total_pages, file_type=raw_doc2.file_type,
        images=images2, layout=layout2, metadata=metadata2
    )

    status_text.text("⏳ Comparing documents…")
    report = await orchestrator.compare_documents(
        processed_doc1, text_embeddings1, table_embeddings1,
        processed_doc2, text_embeddings2, table_embeddings2,
        image_embeddings1, image_embeddings2,
        layout1, layout2, metadata1, metadata2
    )

    progress_bar.progress(100)
    status_text.text("✅ Comparison complete!")

    # Return raw docs alongside the report — the Graph RAG tab ingests them.
    return report, raw_doc1, raw_doc2
| 376 |
def display_results(report):
    """Render the comparison report in the main panel.

    Shows, in order: the overall-similarity gauge with its legend, the
    per-modality breakdown chart and metric row, the top matched sections,
    optional Phase-2 (image/layout/metadata) details, and a JSON download
    button for the full report.
    """
    st.markdown("---")
    st.header("📊 Comparison Results")

    gauge_col, legend_col = st.columns([1, 1])
    with gauge_col:
        st.plotly_chart(
            create_similarity_gauge(report.overall_score),
            use_container_width=True,
        )
    with legend_col:
        st.markdown(create_score_legend())

    st.markdown("---")
    st.subheader("📈 Per-Modality Breakdown")
    st.plotly_chart(
        create_modality_breakdown_chart(report),
        use_container_width=True,
    )

    # One metric column per modality; each row is (label, score object,
    # key of the item-count in that score's details dict).
    metric_cols = st.columns(5)
    modality_rows = [
        ("Text Similarity", report.text_score, "num_matches"),
        ("Table Similarity", report.table_score, "num_matches"),
        ("Image Similarity", report.image_score, "num_matches"),
        ("Layout Similarity", report.layout_score, "num_metrics"),
        ("Metadata Similarity", report.metadata_score, "num_fields_compared"),
    ]
    for column, (label, modality, detail_key) in zip(metric_cols, modality_rows):
        if modality:  # skip modalities that did not run
            column.metric(
                label,
                f"{modality.score:.1%}",
                f"{modality.details.get(detail_key, 0)} items",
            )

    st.markdown("---")
    st.subheader("🔍 Top Matched Sections")
    if report.matched_sections:
        st.markdown(format_matched_sections(report.matched_sections[:10]))
    else:
        st.info("No significant matches found.")

    # Phase-2 detail expanders appear only when at least one such modality ran.
    if report.image_score or report.layout_score or report.metadata_score:
        st.markdown("---")
        st.subheader("🎨 Phase 2 Modality Details")

        if report.image_score and report.image_score.matched_items:
            with st.expander(f"🖼️ Image Matches ({len(report.image_score.matched_items)})"):
                for idx, m in enumerate(report.image_score.matched_items[:5], 1):
                    st.markdown(f"**Match {idx}** → {m['similarity']:.2%}")

        if report.layout_score:
            with st.expander(f"📐 Layout (Score: {report.layout_score.score:.1%})"):
                for k, v in report.layout_score.details.items():
                    if k != "num_metrics":  # count field, not a metric value
                        st.metric(k.replace("_", " ").title(), f"{v:.2%}")

        if report.metadata_score and report.metadata_score.matched_items:
            with st.expander(f"📋 Metadata ({len(report.metadata_score.matched_items)} fields)"):
                for m in report.metadata_score.matched_items:
                    st.markdown(f"**{m['field'].title()}** → {m['similarity']:.2%}")

    st.markdown("---")
    # default=str covers non-JSON-native values such as datetimes.
    report_json = json.dumps(report.model_dump(), indent=2, default=str)
    st.download_button(
        "📥 Download Report (JSON)", data=report_json,
        file_name=f"similarity_report_{report.timestamp.strftime('%Y%m%d_%H%M%S')}.json",
        mime="application/json"
    )
| 436 |
|
| 437 |
|
| 438 |
if __name__ == "__main__":
|
|
Binary files a/src/utils/__pycache__/visualization.cpython-313.pyc and b/src/utils/__pycache__/visualization.cpython-313.pyc differ
|
|
|