SanskarModi committed on
Commit
bb17e33
·
1 Parent(s): 0451125

added document summarizer using langchain

Browse files
backend/app/api/routes_chat_langchain.py DELETED
@@ -1,53 +0,0 @@
1
- """Chat routes using LangChain retriever."""
2
-
3
- from app.config import settings
4
- from app.models.api import ChatRequest, ChatResponse
5
- from app.models.retrieval import ScoredChunk
6
- from app.retrieval.citation_filter import filter_citations
7
- from app.retrieval.langchain_retriever import AtlasGraphRetriever
8
- from fastapi import APIRouter
9
- from langchain.chains import RetrievalQA
10
- from langchain_groq import ChatGroq
11
-
12
- router = APIRouter()
13
-
14
-
15
@router.post("/ask/langchain", response_model=ChatResponse)
def chat_langchain(request: ChatRequest) -> ChatResponse:
    """LangChain-powered RAG endpoint with citation filtering."""
    # Wire the Groq-hosted LLM to the graph retriever via a RetrievalQA chain.
    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatGroq(
            api_key=settings.groq_api_key,
            model=settings.default_model,
        ),
        retriever=AtlasGraphRetriever(top_k=request.top_k),
        return_source_documents=True,
    )

    chain_output = qa_chain.invoke({"query": request.query})
    answer = chain_output["result"]

    # Rebuild ScoredChunk objects from the metadata the retriever stashed
    # on each LangChain Document.
    scored_chunks = []
    for document in chain_output.get("source_documents", []):
        scored_chunks.append(
            ScoredChunk(
                chunk=document.metadata["chunk"],
                score=document.metadata["score"],
            )
        )

    return ChatResponse(
        answer=answer,
        citations=filter_citations(answer=answer, chunks=scored_chunks),
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/api/routes_summarize.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Document summarization route (LangChain-based)."""
2
+
3
+ from app.models.api import ChatResponse
4
+ from app.retrieval.chunk_registry import get_chunks
5
+ from app.summarization.langchain_summarizer import DocumentSummarizer
6
+ from fastapi import APIRouter, HTTPException
7
+
8
+ router = APIRouter()
9
+ summarizer = DocumentSummarizer()
10
+
11
+
12
@router.post("/langchain", response_model=ChatResponse)
def summarize_document() -> ChatResponse:
    """Summarize all ingested documents.

    Note:
        - This is recall-heavy by design
        - No citations (summary ≠ factual QA)
    """
    all_chunks = get_chunks()

    if not all_chunks:
        # Nothing has been ingested yet — return a clear client error
        # rather than handing the summarizer an empty corpus.
        raise HTTPException(
            status_code=400,
            detail="No documents available for summarization.",
        )

    return ChatResponse(
        answer=summarizer.summarize(all_chunks),
        citations=[],
    )
backend/app/main.py CHANGED
@@ -1,8 +1,8 @@
1
  """Main FastAPI application for AtlasRAG backend."""
2
 
3
  from app.api.routes_chat import router as chat_router
4
- from app.api.routes_chat_langchain import router as chat_langchain_router
5
  from app.api.routes_docs import router as docs_router
 
6
  from fastapi import FastAPI
7
  from fastapi.middleware.cors import CORSMiddleware
8
 
@@ -12,7 +12,7 @@ app = FastAPI(
12
  description="Backend API for AtlasRAG multi-document research assistant.",
13
  )
14
 
15
- # CORS enabled for all origins (safe during development)
16
  app.add_middleware(
17
  CORSMiddleware,
18
  allow_origins=["*"],
@@ -24,4 +24,4 @@ app.add_middleware(
24
  # Include routers
25
  app.include_router(chat_router, prefix="/chat")
26
  app.include_router(docs_router, prefix="/docs")
27
- app.include_router(chat_langchain_router, prefix="/chat")
 
1
  """Main FastAPI application for AtlasRAG backend."""
2
 
3
  from app.api.routes_chat import router as chat_router
 
4
  from app.api.routes_docs import router as docs_router
5
+ from app.api.routes_summarize import router as summarize_langchain_router
6
  from fastapi import FastAPI
7
  from fastapi.middleware.cors import CORSMiddleware
8
 
 
12
  description="Backend API for AtlasRAG multi-document research assistant.",
13
  )
14
 
15
+ # CORS enabled for all origins
16
  app.add_middleware(
17
  CORSMiddleware,
18
  allow_origins=["*"],
 
24
  # Include routers
25
  app.include_router(chat_router, prefix="/chat")
26
  app.include_router(docs_router, prefix="/docs")
27
+ app.include_router(summarize_langchain_router, prefix="/summarize")
backend/app/retrieval/langchain_retriever.py DELETED
@@ -1,35 +0,0 @@
1
- """LangChain retriever wrapper for AtlasRAG."""
2
-
3
- from typing import List
4
-
5
- from app.retrieval.retrieve import hybrid_graph_search
6
- from langchain_core.documents import Document
7
- from langchain_core.retrievers import BaseRetriever
8
-
9
-
10
class AtlasGraphRetriever(BaseRetriever):
    """LangChain-compatible retriever wrapping hybrid Graph-RAG."""

    # Number of top-scoring chunks requested per query.
    top_k: int = 5

    def _get_relevant_documents(self, query: str) -> List[Document]:
        """Retrieve documents for LangChain.

        The original ScoredChunk and its score are carried in each
        Document's metadata so downstream consumers can reconstruct
        ScoredChunk objects without re-running the search.
        """
        return [
            Document(
                page_content=hit.chunk.text,
                metadata={
                    "doc_id": hit.chunk.doc_id,
                    "page_start": hit.chunk.page_start,
                    "page_end": hit.chunk.page_end,
                    "chunk": hit.chunk,
                    "score": hit.score,
                },
            )
            for hit in hybrid_graph_search(query, self.top_k)
        ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/summarization/__init__.py ADDED
File without changes
backend/app/summarization/langchain_summarizer.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """LangChain-based document summarization using a local HF model."""
2
+
3
+ from typing import List
4
+
5
+ from app.models.ingestion import Chunk
6
+ from langchain.chains.summarize import load_summarize_chain
7
+ from langchain.docstore.document import Document
8
+ from langchain.llms import HuggingFacePipeline
9
+ from transformers import pipeline
10
+
11
+
12
class DocumentSummarizer:
    """Document summarizer using LangChain + local HF model."""

    def __init__(self) -> None:
        """Initialize HF Pipeline."""
        # device=-1 selects CPU inference for the HF pipeline.
        hf_pipeline = pipeline(
            "summarization",
            model="facebook/bart-large-cnn",
            device=-1,
        )
        self.llm = HuggingFacePipeline(pipeline=hf_pipeline)

        # "map_reduce": summarize pieces independently, then combine.
        self.chain = load_summarize_chain(
            llm=self.llm,
            chain_type="map_reduce",
            verbose=False,
        )

    def summarize(self, chunks: List[Chunk]) -> str:
        """Summarize document chunks.

        Returns a fixed message when there is nothing to summarize.
        """
        if not chunks:
            return "No content available to summarize."

        # Wrap each chunk as a LangChain Document, keeping provenance
        # metadata alongside the text.
        documents = [
            Document(
                page_content=piece.text,
                metadata={
                    "doc_id": piece.doc_id,
                    "page_start": piece.page_start,
                    "page_end": piece.page_end,
                },
            )
            for piece in chunks
        ]

        # NOTE(review): chain.run is deprecated in newer LangChain releases
        # in favour of invoke — kept as-is here to preserve behaviour.
        return self.chain.run(documents)
requirements.txt CHANGED
@@ -22,6 +22,7 @@ pymupdf==1.24.7
22
  spacy==3.7.4
23
  https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
24
  sentence-transformers==2.6.1
 
25
  rank-bm25==0.2.2
26
  whoosh==2.7.4
27
 
 
22
  spacy==3.7.4
23
  https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
24
  sentence-transformers==2.6.1
25
+ accelerate==1.12.0
26
  rank-bm25==0.2.2
27
  whoosh==2.7.4
28