Spaces:

anabaslabs
/

ARC

Running

App Files Community

github-actions[bot] committed on 24 days ago

Commit

c1b316f

1 Parent(s): 177b4ac

Deploy from GitHub Actions: 37b5d8a4f4600eb83d09a6e8bb178d0f7e6bc890

Browse files

Files changed (4) hide show

app/config.py +11 -2
app/rag/loader.py +7 -0
app/rag/pipeline.py +24 -0
app/routes/ask.py +6 -1

app/config.py CHANGED Viewed

@@ -38,16 +38,25 @@ ALLOWED_TYPES = {
     "txt",
     "md",
     "json",
 }
 PROMPT = (
     "You are ARC, a helpful document assistant. "
-    "Answer the question based ONLY on the provided context. "
     "If the context contains math or LaTeX, preserve them using $ for inline and $$ for display math. "
-    "If you cannot answer from the context, say so honestly. "
     "Context: {context} Question: {question}"
 )
 CREATORS = [
     {"Krishnendu Das" : "https://itskdhere.com"},
     {"Saptarshi Roy" : "https://hirishi.in"}

     "txt",
     "md",
     "json",
+    "tex",
 }
 PROMPT = (
     "You are ARC, a helpful document assistant. "
+    "Your goal is to provide accurate and helpful answers based on the context provided. "
+    "If the user asks for a summary, synthesize the context into a clear, structured overview. "
     "If the context contains math or LaTeX, preserve them using $ for inline and $$ for display math. "
+    "If you cannot find the answer in the context, say so honestly, but try to be as helpful as possible with the information you have. "
     "Context: {context} Question: {question}"
 )
+SUMMARY_PROMPT = (
+    "Provide a concise yet comprehensive summary of the following document content. "
+    "Focus on the main topics, key points, and overall purpose of the document. "
+    "This summary will be used to help a chatbot understand the document at a high level. "
+    "Content: {content}"
+)
 CREATORS = [
     {"Krishnendu Das" : "https://itskdhere.com"},
     {"Saptarshi Roy" : "https://hirishi.in"}

app/rag/loader.py CHANGED Viewed

@@ -74,3 +74,10 @@ def read_pptx(path: str) -> list[Document]:
     docs = loader.load()
     return docs

     docs = loader.load()
     return docs
+# TEX
+# https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.text.TextLoader
+def read_tex(path: str) -> list[Document]:
+    loader = TextLoader(path, encoding="utf-8")
+    docs = loader.load()
+    return docs

app/rag/pipeline.py CHANGED Viewed

@@ -9,9 +9,12 @@ from app.rag.loader import (
     read_pptx,
     read_txt,
     read_xlsx,
 )
 from app.rag.vectorstore import add_documents
 from langchain_core.documents import Document
 LOADERS = {
     "pdf": read_pdf,
@@ -22,8 +25,10 @@ LOADERS = {
     "docx": read_docx,
     "xlsx": read_xlsx,
     "pptx": read_pptx,
 }
 def _clean_docs(docs: list[Document]) -> list[Document]:
     for doc in docs:
@@ -31,13 +36,32 @@ def _clean_docs(docs: list[Document]) -> list[Document]:
         doc.page_content = process_latex(doc.page_content)
     return docs
 def process_file(path: str, ext: str, session_id: str = "default_index") -> int:
     loader = LOADERS.get(ext.lower())
     if loader is None:
         raise ValueError(f"Unsupported file type: .{ext}")
     docs = loader(path)
     docs = _clean_docs(docs)
     chunks = chunk_docs(docs)
     add_documents(chunks, session_id=session_id)
     return len(chunks)

     read_pptx,
     read_txt,
     read_xlsx,
+    read_tex,
 )
 from app.rag.vectorstore import add_documents
 from langchain_core.documents import Document
+from app.config import CHAT_MODEL, GOOGLE_API_KEY, SUMMARY_PROMPT
+from langchain_google_genai import ChatGoogleGenerativeAI
 LOADERS = {
     "pdf": read_pdf,
     "docx": read_docx,
     "xlsx": read_xlsx,
     "pptx": read_pptx,
+    "tex": read_tex,
 }
+llm = ChatGoogleGenerativeAI(model=CHAT_MODEL, google_api_key=GOOGLE_API_KEY)
 def _clean_docs(docs: list[Document]) -> list[Document]:
     for doc in docs:
         doc.page_content = process_latex(doc.page_content)
     return docs
+def _generate_summary(docs: list[Document]) -> str:
+    full_text = "\n\n".join(doc.page_content for doc in docs[:10])
+    try:
+        response = llm.invoke(SUMMARY_PROMPT.format(content=full_text))
+        return str(response.content)
+    except Exception as e:
+        print(f"Error generating summary: {e}")
+        return ""
 def process_file(path: str, ext: str, session_id: str = "default_index") -> int:
     loader = LOADERS.get(ext.lower())
     if loader is None:
         raise ValueError(f"Unsupported file type: .{ext}")
     docs = loader(path)
     docs = _clean_docs(docs)
+    # summary_text = _generate_summary(docs)
     chunks = chunk_docs(docs)
+    # if summary_text:
+    #     src = docs[0].metadata.get("source", "unknown")
+    #     summary_doc = Document(
+    #         page_content=f"DOCUMENT SUMMARY of {src}: {summary_text}",
+    #         metadata={"source": src, "is_summary": True}
+    #     )
+    #     chunks.insert(0, summary_doc)
     add_documents(chunks, session_id=session_id)
     return len(chunks)

app/routes/ask.py CHANGED Viewed

@@ -24,7 +24,12 @@ class AskResponse(BaseModel):
 async def ask(body: AskRequest, user_id: str = Depends(get_user_id)) -> AskResponse:
     prefixed_session_id = f"{user_id}_{body.session_id}"
     store = get_vectorstore(prefixed_session_id)
-    docs = store.similarity_search(body.question, k=TOP_K)
     if not docs:
         raise HTTPException(400, "No documents found for this session.")

 async def ask(body: AskRequest, user_id: str = Depends(get_user_id)) -> AskResponse:
     prefixed_session_id = f"{user_id}_{body.session_id}"
     store = get_vectorstore(prefixed_session_id)
+    is_summary_request = any(word in body.question.lower() for word in ["summarize", "summary", "overview", "tl;dr"])
+    k = TOP_K * 2 if is_summary_request else TOP_K
+    docs = store.similarity_search(body.question, k=k)
     if not docs:
         raise HTTPException(400, "No documents found for this session.")