Spaces:

saashley
/

capitolati-rag

Sleeping

App Files Files Community

Ashley Andrea Squarcio committed on May 21, 2025

Commit

2fc692a

1 Parent(s): 486389e

Initial import: code, dependencies, chunk_index.pkl (LFS tracked)

Browse files

Files changed (7) hide show

app.py +87 -0
chunk_index.pkl +3 -0
dspy_wrapper.py +71 -0
main.py +34 -0
neo4j_config.py +111 -0
requirements.txt +156 -0
retriever.py +222 -0

app.py ADDED Viewed

	@@ -0,0 +1,87 @@

+from main import rag_pipeline
+import gradio as gr
+import html
+# Municipality names offered in the UI dropdown (an extra "All" entry is
+# prepended where the dropdown is built). The selected name is handed to the
+# pipeline as the municipality argument.
+municipalities = [
+    "Comun General de Fascia",
+    "Comune di Capo D'Orlando",
+    "Comune di Casatenovo",
+    "Comune di Fonte Nuova",
+    "Comune di Gubbio",
+    "Comune di Torre Santa Susanna",
+    "Comune di Santa Maria Capua Vetere"
+]
+def answer_fn(query: str, municipality: str):
+    # "All" or empty count as no filter
+    filters = {}
+    if municipality and municipality != "All":
+        filters["municipality"] = municipality
+    output = rag_pipeline(query=query, municipality=municipality)
+    final_answer = output["final_answer"]
+    cot = output["chain_of_thought"]
+    chunks = output["retrieved_chunks"]
+    html_blocks = []
+    for i, c in enumerate(chunks, 1):
+        text = html.escape(c["chunk_text"])
+        meta = {
+            "Document ID": c.get("document_id", "N/A"),
+            "Municipality": c.get("municipality", "N/A"),
+            "Section": c.get("section", "N/A"),
+            "Page": c.get("page", "N/A"),
+            "Score": f"{c.get('final_score', 0):.4f}"
+        }
+        # transforms metadata dictionary into a series of <li> (list) items
+        meta_lines = "".join(f"<li><b>{k}</b>: {v}</li>" for k, v in meta.items())
+        # displays chunk number as header, metadata and text
+        block = f"""
+        <div style="margin-bottom:1em;">
+          <h4>Chunk {i}</h4>
+          <ul>{meta_lines}</ul>
+          <p style="white-space: pre-wrap;">{text}</p>
+        </div>
+        <hr/>
+        """
+        html_blocks.append(block)
+    chunks_html = "\n".join(html_blocks) or "<i>No chunks retrieved.</i>"
+    return final_answer, cot, chunks_html
+# Gradio UI: a question box and a municipality dropdown feed answer_fn; its
+# three return values fill the answer box, the chain-of-thought box and the
+# retrieved-chunks HTML panel.
+with gr.Blocks() as demo:
+    gr.Markdown("## DSPy RAG Demo")
+    with gr.Row():
+        query_input = gr.Textbox(label="Question", placeholder="Type your query here…")
+        # "All" is the default selection and stands for "no municipality filter"
+        muni_input = gr.Dropdown(
+            choices=["All"] + municipalities,
+            value="All",
+            label="Municipality (optional)"
+        )
+    run_btn = gr.Button("Get Answer")
+    ans_out = gr.Textbox(label="Final Answer", lines=3)  # answer container
+    # CoT container inside its accordion
+    with gr.Accordion("Chain of Thought Reasoning", open=False):
+        cot_txt = gr.Textbox(label="", interactive=False, lines=6)
+    # chunks HTML container inside its accordion
+    with gr.Accordion("Retrieved Chunks with Metadata", open=False):
+        chunks_html = gr.HTML("<i>No data yet.</i>")
+    # Wire the button click to the function (with outputs matching the order of returned values)
+    run_btn.click(
+        fn=answer_fn,
+        inputs=[query_input, muni_input],
+        outputs=[ans_out, cot_txt, chunks_html]
+    )
+# Runs at import time (there is no __main__ guard).
+# NOTE(review): share=True requests a public share link; confirm it is still
+# wanted when the app is hosted rather than run locally.
+demo.launch(share=True)

chunk_index.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5962e522e6557c69c68ba22fc4d70487fc34b83cc6875be11b5e3564b3a38de1
+size 17549445

dspy_wrapper.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import dspy
+from typing import List, Dict
+import os
+# The OpenAI key is read from the environment; importing this module fails
+# fast with a RuntimeError when it is missing.
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+if not OPENAI_API_KEY:
+    raise RuntimeError("Missing OPENAI_API_KEY env var")
+gpt_4o_mini = dspy.LM('openai/gpt-4o-mini', api_key=OPENAI_API_KEY)
+# using unimib credentials, switch to PeS if needed!
+# (i.e. which account's key is in use depends on the OPENAI_API_KEY value
+# supplied by the environment)
+# Register the model as the default LM for every DSPy module in the process.
+dspy.configure(lm=gpt_4o_mini)
+# == Building Blocks ==
+class DSPyHybridRetriever(dspy.Module):
+    def __init__(self, retriever):
+        super().__init__()
+        self.retriever = retriever
+    def forward(self, query: str, municipality: str = "", top_k: int = 5):
+        results = self.retriever.retrieve(query, top_k=top_k, municipality_filter=municipality) # remember to change to rerank
+        return {"retrieved_chunks": results}
+class RetrieveChunks(dspy.Signature):
+    """Given a user query and optional municipality, retrieve relevant text chunks."""
+    query = dspy.InputField(desc="User's question")
+    municipality = dspy.InputField(desc="Optional municipality filter")
+    retrieved_chunks = dspy.OutputField(
+        desc=(
+            "List of retrieved chunks, each as a dict with keys: "
+            "`chunk`, `document_id`, `section`, `level`, `page`, "
+            "`dense_score`, `sparse_score`, `graph_score`, `final_score`"
+        ),
+        type=List[Dict]  # each item is a dict carrying all those fields
+    )
+# Signature for the answering step: the question plus the retrieved chunk
+# texts go in, a final answer and its reasoning come out. RAGChain reads the
+# two outputs as `.answer` and `.rationale`.
+# NOTE(review): the docstring and field order presumably feed the generated
+# prompt, so treat edits to them as behavior changes.
+class AnswerWithEvidence(dspy.Signature):
+    """Answer the query using reasoning and retrieved chunks as context."""
+    query = dspy.InputField(desc="User's question")
+    # RAGChain passes a list of chunk text strings here
+    retrieved_chunks = dspy.InputField(desc="Retrieved text chunks (List[dict])")
+    answer = dspy.OutputField(desc="Final answer")
+    rationale = dspy.OutputField(desc="Chain-of-thought reasoning")
+# == RAG Pipeline ==
+class RAGChain(dspy.Module):
+    def __init__(self, retriever, answerer):
+        super().__init__()
+        self.retriever = retriever
+        self.answerer = answerer
+    def forward(self, query: str, municipality: str = ""):
+        # retrieve full dicts
+        retrieved = self.retriever(query=query, municipality=municipality)
+        chunks = retrieved["retrieved_chunks"]
+        # feed only the raw text into the CoT module
+        answer_result = self.answerer(
+            query=query,
+            retrieved_chunks=[c["chunk_text"] for c in chunks]
+        )
+        # return both the metadata and the LLM answer
+        return {
+            "query": query,
+            "municipality": municipality,
+            "retrieved_chunks": chunks,
+            "chain_of_thought": answer_result.rationale,
+            "final_answer": answer_result.answer,
+        }

main.py ADDED Viewed

	@@ -0,0 +1,34 @@

+from neo4j_config import URI, USER, PASSWORD, AUTH
+from retriever import *
+from dspy_wrapper import *
+from neo4j import GraphDatabase
+import os, pickle
+# == Fast Load of Precomputed Index ==
+# Everything below runs at import time: app.py imports `rag_pipeline` from
+# this module.
+HERE = os.path.dirname(__file__)
+# NOTE(review): pickle.load executes arbitrary code from the file; only load
+# a chunk_index.pkl that comes from a trusted source.
+with open(os.path.join(HERE, "chunk_index.pkl"), "rb") as f:
+    all_chunks = pickle.load(f)  # already contain embeddings and ids
+# == Neo4j Setup ==
+# Connectivity check only; the driver is closed again when the block exits.
+# NOTE(review): neo4j_config runs the same check at import time.
+with GraphDatabase.driver(URI, auth=AUTH) as driver:
+    driver.verify_connectivity()
+# == Retrieval ==
+retriever = HybridRetriever(all_chunks)
+# NOTE(review): the reranker is constructed (opening its own driver) but the
+# pipeline below is wired to the plain `retriever`, so it is currently unused.
+reranker = GraphReranker(
+    retriever,
+    neo4j_uri=URI,
+    neo4j_user=USER,
+    neo4j_pass=PASSWORD,
+    beta=0.2,
+    max_hops=3
+)
+# == Pipeline Initialization ==
+retriever_module = DSPyHybridRetriever(retriever)
+cot_module = dspy.ChainOfThought(AnswerWithEvidence)
+# Callable used by the UI: rag_pipeline(query=..., municipality=...)
+rag_pipeline = RAGChain(retriever=retriever_module, answerer=cot_module)

neo4j_config.py ADDED Viewed

	@@ -0,0 +1,111 @@

+import re
+from neo4j import GraphDatabase
+URI = "neo4j+s://1ea442ce.databases.neo4j.io"
+USER = "neo4j"
+PASSWORD = "diGxvEhJnqcp18rHDwPzGv1KaRvxprUvdD1h31unwa8"
+AUTH = (USER, PASSWORD)
+with GraphDatabase.driver(URI, auth=AUTH) as driver:
+    driver.verify_connectivity()
+def normalize_int(value, default=0):
+    """
+    Safely convert value to int.
+    - If already int, return it.
+    - If str of digits, parse it.
+    - Otherwise return `default`.
+    """
+    if isinstance(value, int):
+        return value
+    if isinstance(value, str) and value.isdigit():
+        return int(value)
+    # optionally, extract digits from strings like "1.":
+    m = re.match(r"(\d+)", str(value))
+    if m:
+        return int(m.group(1))
+    return default
+def add_municipality(tx, municipality):
+    tx.run("""
+        MERGE (m:Municipality {name: $municipality})
+    """, municipality=municipality)
+def add_document(tx, doc_id, municipality):
+    tx.run("""
+        MATCH (m:Municipality {name: $municipality})
+        MERGE (d:Document {doc_id: $doc_id})
+        MERGE (m)-[:HAS_DOCUMENT]->(d)
+    """, municipality=municipality, doc_id=doc_id)
+def add_chunk(tx, chunk):
+    tx.run("""
+        MATCH (d:Document {doc_id: $doc_id})
+        MERGE (c:Chunk {id: $id})
+        SET c.page = $page,
+            c.section = $section,
+            c.level = $level,
+            c.text = $text,
+            c.embedding = $embedding
+        MERGE (d)-[:HAS_CHUNK]->(c)
+    """, id=chunk["id"], doc_id=chunk["document_id"],
+         page=chunk["page"], section=chunk["section"],
+         level=chunk["level"], text=chunk["chunk_text"],
+         embedding=chunk["embedding"])
+def link_parent(tx, parent_id, child_id):
+    tx.run("""
+        MATCH (p:Chunk {id: $parent_id}), (c:Chunk {id: $child_id})
+        MERGE (p)-[:HAS_SUBSECTION]->(c)
+    """, parent_id=parent_id, child_id=child_id)
+def link_sibling(tx, sibling1_id, sibling2_id):
+    tx.run("""
+        MATCH (c1:Chunk {id: $sibling1_id}), (c2:Chunk {id: $sibling2_id})
+        MERGE (c1)-[:NEXT_TO]->(c2)
+    """, sibling1_id=sibling1_id, sibling2_id=sibling2_id)
+# takes again quite some time to compute, we could re-download a pkl file with ids as well
+def sync_chunk_ids(all_chunks, driver, prefix_len=50):
+    """
+    For each chunk in-memory, look up its real DB id by matching on:
+      - page
+      - section
+      - the first `prefix_len` chars of text
+    If already present, overwrites chunk["id"] with the DB value when found,
+    otherwise retrieves the id from the graph db and adds it to each chunk's dict.
+    """
+    with driver.session() as session:
+        for chunk in all_chunks:
+            # build prefix of the chunk text
+            prefix = chunk["chunk_text"][:prefix_len]
+            # normalize numeric props
+            page = normalize_int(chunk.get("page"))
+            cypher = """
+            MATCH (c:Chunk {
+              page: $page,
+              section: $section
+            })
+            WHERE c.text STARTS WITH $prefix
+            RETURN c.id AS real_id
+            LIMIT 1
+            """
+            params = {
+                "page": page,
+                "section": chunk["section"],
+                "prefix": prefix
+            }
+            rec = session.run(cypher, params).single()
+            if rec:
+                chunk["id"] = rec["real_id"]
+            else:
+                print(f"No DB match for chunk: page={page} "
+                      f"section={chunk.get('section')!r} prefix={prefix!r}")
+# NOTE: the chunk ingestion code is not included in this file; see the Colab notebook.

requirements.txt ADDED Viewed

	@@ -0,0 +1,156 @@

+aiofiles==24.1.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.11.18
+aiosignal==1.3.2
+alembic==1.15.2
+altair==5.5.0
+annotated-types==0.7.0
+anyio==4.9.0
+appnope==0.1.4
+asttokens==3.0.0
+asyncer==0.0.8
+attrs==25.3.0
+backoff==2.2.1
+blinker==1.9.0
+cachetools==5.5.2
+certifi==2025.4.26
+cffi==1.17.1
+charset-normalizer==3.4.2
+click==8.1.8
+cloudpickle==3.1.1
+colorlog==6.9.0
+comm==0.2.2
+cryptography==44.0.3
+datasets==3.5.1
+debugpy==1.8.14
+decorator==5.2.1
+dill==0.3.8
+diskcache==5.6.3
+distro==1.9.0
+dspy==2.6.14
+dspy-ai==2.6.23
+executing==2.2.0
+faiss-cpu==1.11.0
+fastapi==0.115.12
+ffmpy==0.5.0
+filelock==3.18.0
+frozenlist==1.6.0
+fsspec==2025.3.0
+gitdb==4.0.12
+GitPython==3.1.44
+gradio==5.30.0
+gradio_client==1.10.1
+groovy==0.1.2
+h11==0.16.0
+httpcore==1.0.9
+httpx==0.28.1
+huggingface-hub==0.30.2
+idna==3.10
+importlib_metadata==8.7.0
+ipykernel==6.29.5
+ipython>=8,<9
+ipython_pygments_lexers==1.1.1
+jedi==0.19.2
+Jinja2==3.1.6
+jiter==0.9.0
+joblib==1.5.0
+json_repair==0.44.1
+jsonschema==4.23.0
+jsonschema-specifications==2025.4.1
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+litellm==1.63.7
+magicattr==0.1.6
+Mako==1.3.10
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.4.3
+multiprocess==0.70.16
+narwhals==1.38.0
+neo4j==5.28.1
+nest-asyncio==1.6.0
+networkx==3.4.2
+numpy==2.2.5
+openai==1.61.0
+optuna==4.3.0
+orjson==3.10.18
+packaging==24.2
+pandas==2.2.3
+parso==0.8.4
+pdfminer.six==20250327
+pdfplumber==0.11.6
+pexpect==4.9.0
+pillow==11.2.1
+platformdirs==4.3.7
+prompt_toolkit==3.0.51
+propcache==0.3.1
+protobuf==6.30.2
+psutil==7.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pyarrow==20.0.0
+pycparser==2.22
+pydantic==2.11.4
+pydantic_core==2.33.2
+pydeck==0.9.1
+pydub==0.25.1
+Pygments==2.19.1
+PyMuPDF==1.25.5
+PyPDF2==3.0.1
+pypdfium2==4.30.1
+python-dateutil==2.9.0.post0
+python-dotenv==1.1.0
+python-multipart==0.0.20
+pytz==2025.2
+PyYAML==6.0.2
+pyzmq==26.4.0
+RapidFuzz==3.13.0
+referencing==0.36.2
+regex==2024.11.6
+requests==2.32.3
+rich==14.0.0
+rpds-py==0.24.0
+ruff==0.11.10
+safehttpx==0.1.6
+safetensors==0.5.3
+scikit-learn==1.6.1
+scipy==1.15.2
+semantic-version==2.10.0
+sentence-transformers==4.1.0
+setuptools==80.3.1
+shellingham==1.5.4
+six==1.17.0
+sklearn-preprocessing==0.1.0
+smmap==5.0.2
+sniffio==1.3.1
+SQLAlchemy==2.0.40
+stack-data==0.6.3
+starlette==0.46.2
+streamlit==1.45.0
+sympy==1.14.0
+tenacity==9.1.2
+threadpoolctl==3.6.0
+tiktoken==0.9.0
+tokenizers==0.21.1
+toml==0.10.2
+tomlkit==0.13.2
+torch==2.7.0
+tornado==6.4.2
+tqdm==4.67.1
+traitlets==5.14.3
+transformers==4.51.3
+typer==0.15.4
+typing-inspection==0.4.0
+typing_extensions==4.13.2
+tzdata==2025.2
+ujson==5.10.0
+urllib3==2.4.0
+uvicorn==0.34.2
+wcwidth==0.2.13
+websockets==15.0.1
+xxhash==3.5.0
+yarl==1.20.0
+zipp==3.21.0

retriever.py ADDED Viewed

	@@ -0,0 +1,222 @@

+import faiss
+import re
+import numpy as np
+from typing import List, Dict
+from sklearn.preprocessing import normalize
+from sentence_transformers import SentenceTransformer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+from rapidfuzz import fuzz
+from neo4j import GraphDatabase
+# Sentence-embedding model, loaded once at import time; HybridRetriever uses
+# it to encode queries. NOTE(review): the precomputed chunk embeddings are
+# presumably produced by this same model; confirm against the ingestion code.
+embedding_model = SentenceTransformer('nickprock/multi-sentence-BERTino')
+# Dimensionality of the vectors produced by the model.
+embedding_dim = embedding_model.get_sentence_embedding_dimension()
+# == Base Hybrid Retriever (Dense + Sparse) ==
+class HybridRetriever:
+    def __init__(self, chunks: List[Dict]):
+        self.raw_chunks = chunks
+        # load precomputed embeddings (with bertino)
+        print("Loading dense embeddings from chunks...")
+        dense_embeddings = np.vstack([
+            chunk["embedding"] for chunk in chunks
+        ])
+        self.embeddings = normalize(dense_embeddings, axis=1, norm='l2') # l2 normalization for cosine similarity
+        print("Fitting FAISS index...")
+        self.index = faiss.IndexFlatIP(self.embeddings.shape[1])
+        self.index.add(self.embeddings)
+        print("Fitting TF-IDF vectorizer...")
+        self.texts = [chunk["chunk_text"] for chunk in chunks]
+        self.vectorizer = TfidfVectorizer(stop_words=None)
+        self.sparse_matrix = self.vectorizer.fit_transform(self.texts)
+        # this is just a temporary hard-coded fix for correctly matching this municipality in the UI retrieval,
+        # since ' is still not recognized in municipality extraction
+        self._overrides = {
+            "capo d orlando": "capo d"
+        }
+    def fuzzy_match(self, municipality_ref, user_input, threshold=80):
+        """
+        Fuzzy-matches stored municipality against user input.
+        Uses rapidfuzz for accurate and fast matching.
+        """
+        # Normalize both strings
+        def normalize(text):
+            text = text.lower()
+            # remove common prefixes
+            for prefix in ("comune di ", "comuni di ", "città di ", "citta di "):
+                if text.startswith(prefix):
+                    text = text[len(prefix):]
+            # strip out non-alphanumeric characters
+            text = re.sub(r"[^a-z0-9]+", " ", text)
+            return re.sub(r"\s+", " ", text).strip()
+        ref = normalize(municipality_ref)
+        inp = normalize(user_input)
+        # override check: if the normalized input matches a key, force match
+        if inp in self._overrides:
+            # normalize the override target and compare to this ref
+            override_target = normalize(self._overrides[inp])
+            if ref == override_target:
+                return True
+        # Compute fuzzy match score
+        score = fuzz.ratio(ref, inp)
+        return score >= threshold
+    def retrieve(self, query, top_k=5, municipality_filter=None, alpha=0.8, threshold=0.3):
+        print("Starting hybrid retrieval...")
+        query_embedding = embedding_model.encode([query], convert_to_numpy=True)
+        query_embedding = normalize(query_embedding, axis=1, norm='l2')  # query also needs l2 normalization
+        query_tfidf = self.vectorizer.transform([query])
+        sparse_sim = cosine_similarity(query_tfidf, self.sparse_matrix).flatten()
+        D, I = self.index.search(query_embedding, top_k * 5)
+        results = []
+        seen = set()
+        for rank, idx in enumerate(I[0]):
+            if idx in seen:
+                continue
+            seen.add(idx)
+            # Enforce municipality constraint
+            if municipality_filter:
+                stored = self.raw_chunks[idx].get("municipality", "")
+                if not self.fuzzy_match(stored, municipality_filter):
+                    continue
+            dense_score = D[0][rank]
+            sparse_score = sparse_sim[idx]
+            hybrid_score = alpha * dense_score + (1 - alpha) * sparse_score
+            if hybrid_score < threshold:
+                continue
+            results.append({
+                **self.raw_chunks[idx], # appends all chunk properties to the output (including id and embedding!)
+                "dense_score": dense_score,
+                "sparse_score": sparse_score,
+                "hybrid_score": hybrid_score
+            })
+            if len(results) >= top_k:
+                break
+        # Fallback if nothing survived the threshold/filtering
+        if not results:
+            print(f"No results above threshold for '{municipality_filter}'. Falling back to top-{top_k}.")
+            results = []
+            for rank, idx in enumerate(I[0][:top_k]):
+                dense_score = D[0][rank]
+                sparse_score = sparse_sim[idx]
+                hybrid_score = alpha * dense_score + (1 - alpha) * sparse_score
+                results.append({
+                    **self.raw_chunks[idx],
+                    "dense_score": dense_score,
+                    "sparse_score": sparse_score,
+                    "hybrid_score": hybrid_score
+                })
+        # Return sorted top-k
+        top_results = sorted(results, key=lambda x: x["hybrid_score"], reverse=True)[:top_k]
+        return top_results
+# == Graph Reranker ==
+class GraphReranker:
+    def __init__(
+        self,
+        base_retriever,
+        neo4j_uri: str,
+        neo4j_user: str,
+        neo4j_pass: str,
+        beta: float = 0.2,
+        max_hops: int = 3
+    ):
+        """
+        base_retriever: instance of HybridRetriever
+        beta: weight for the graph component
+        max_hops: how many hops we'll search in shortestPath()
+        """
+        self.base = base_retriever
+        self.beta = beta
+        self.max_hops = max_hops
+        self.driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_pass))
+    def graph_score(self, candidate_id: str, seed_ids: list[str]) -> float:
+        """
+        Uses built-in shortestPath() over HAS_SUBSECTION|NEXT_TO
+        with a literal max_hops in the relationship pattern.
+        """
+        rel_pat = f"[:HAS_SUBSECTION|NEXT_TO*..{self.max_hops}]"
+        query = f"""
+        MATCH p = shortestPath(
+          (seed:Chunk {{id: $seed_id}})-{rel_pat}-(cand:Chunk {{id: $cand_id}})
+        )
+        RETURN length(p) AS hops
+        """
+        min_hops = None
+        with self.driver.session() as sess:
+            for sid in seed_ids:
+                # Skip self, don’t count seed==candidate
+                if sid == candidate_id:
+                    continue
+                rec = sess.run(
+                    query,
+                    {"seed_id": sid, "cand_id": candidate_id}
+                ).single()
+                if rec and rec["hops"] is not None:
+                    h = rec["hops"]
+                    if min_hops is None or h < min_hops:
+                        min_hops = h
+        # If no path found to any OTHER seed, score is 0
+        return 0.0 if min_hops is None else 1.0 / (1 + min_hops)
+    def rerank(
+        self,
+        query: str,
+        top_k: int = 5,
+        municipality_filter: str | None = None,
+        alpha: float = 0.8,
+        threshold: float = 0.3
+    ) -> list[dict]:
+        """
+        1. Pull a broader set of text-only candidates
+        2. Compute graph_score for each
+        3. Blend and return the top_k final results
+        """
+        raw_cands = self.base.retrieve(
+            query,
+            top_k=top_k * 5, # more candidates than top_k to give material to the graph
+            municipality_filter=municipality_filter,
+            alpha=alpha,
+            threshold=threshold
+        )
+        # extract seed IDs from the first top_k (best text‐only hits)
+        seed_ids = [c["id"] for c in raw_cands[:top_k]]
+        # compute graph_score for each candidate
+        enriched = []
+        for cand in raw_cands:
+            g = self.graph_score(cand["id"], seed_ids)
+            basic = cand.get("hybrid_score",
+                              alpha * cand["dense_score"] + (1-alpha) * cand["sparse_score"])
+            final = basic + self.beta * g
+            enriched.append({ **cand,
+                              "graph_score": g,
+                              "final_score": final })
+        return sorted(enriched, key=lambda x: x["final_score"], reverse=True)[:top_k]