Spaces:
Running
Running
github-actions[bot] committed on
Commit ·
c1b316f
1
Parent(s): 177b4ac
Deploy from GitHub Actions: 37b5d8a4f4600eb83d09a6e8bb178d0f7e6bc890
Browse files
- app/config.py +11 -2
- app/rag/loader.py +7 -0
- app/rag/pipeline.py +24 -0
- app/routes/ask.py +6 -1
app/config.py
CHANGED
|
@@ -38,16 +38,25 @@ ALLOWED_TYPES = {
|
|
| 38 |
"txt",
|
| 39 |
"md",
|
| 40 |
"json",
|
|
|
|
| 41 |
}
|
| 42 |
|
| 43 |
PROMPT = (
|
| 44 |
"You are ARC, a helpful document assistant. "
|
| 45 |
-
"
|
|
|
|
| 46 |
"If the context contains math or LaTeX, preserve them using $ for inline and $$ for display math. "
|
| 47 |
-
"If you cannot answer
|
| 48 |
"Context: {context} Question: {question}"
|
| 49 |
)
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
CREATORS = [
|
| 52 |
{"Krishnendu Das" : "https://itskdhere.com"},
|
| 53 |
{"Saptarshi Roy" : "https://hirishi.in"}
|
|
|
|
| 38 |
"txt",
|
| 39 |
"md",
|
| 40 |
"json",
|
| 41 |
+
"tex",
|
| 42 |
}
|
| 43 |
|
| 44 |
PROMPT = (
|
| 45 |
"You are ARC, a helpful document assistant. "
|
| 46 |
+
"Your goal is to provide accurate and helpful answers based on the context provided. "
|
| 47 |
+
"If the user asks for a summary, synthesize the context into a clear, structured overview. "
|
| 48 |
"If the context contains math or LaTeX, preserve them using $ for inline and $$ for display math. "
|
| 49 |
+
"If you cannot find the answer in the context, say so honestly, but try to be as helpful as possible with the information you have. "
|
| 50 |
"Context: {context} Question: {question}"
|
| 51 |
)
|
| 52 |
|
| 53 |
+
SUMMARY_PROMPT = (
|
| 54 |
+
"Provide a concise yet comprehensive summary of the following document content. "
|
| 55 |
+
"Focus on the main topics, key points, and overall purpose of the document. "
|
| 56 |
+
"This summary will be used to help a chatbot understand the document at a high level. "
|
| 57 |
+
"Content: {content}"
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
CREATORS = [
|
| 61 |
{"Krishnendu Das" : "https://itskdhere.com"},
|
| 62 |
{"Saptarshi Roy" : "https://hirishi.in"}
|
app/rag/loader.py
CHANGED
|
@@ -74,3 +74,10 @@ def read_pptx(path: str) -> list[Document]:
|
|
| 74 |
docs = loader.load()
|
| 75 |
return docs
|
| 76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
docs = loader.load()
|
| 75 |
return docs
|
| 76 |
|
| 77 |
+
|
| 78 |
+
# TEX
|
| 79 |
+
# https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.text.TextLoader
|
| 80 |
+
def read_tex(path: str) -> list[Document]:
|
| 81 |
+
loader = TextLoader(path, encoding="utf-8")
|
| 82 |
+
docs = loader.load()
|
| 83 |
+
return docs
|
app/rag/pipeline.py
CHANGED
|
@@ -9,9 +9,12 @@ from app.rag.loader import (
|
|
| 9 |
read_pptx,
|
| 10 |
read_txt,
|
| 11 |
read_xlsx,
|
|
|
|
| 12 |
)
|
| 13 |
from app.rag.vectorstore import add_documents
|
| 14 |
from langchain_core.documents import Document
|
|
|
|
|
|
|
| 15 |
|
| 16 |
LOADERS = {
|
| 17 |
"pdf": read_pdf,
|
|
@@ -22,8 +25,10 @@ LOADERS = {
|
|
| 22 |
"docx": read_docx,
|
| 23 |
"xlsx": read_xlsx,
|
| 24 |
"pptx": read_pptx,
|
|
|
|
| 25 |
}
|
| 26 |
|
|
|
|
| 27 |
|
| 28 |
def _clean_docs(docs: list[Document]) -> list[Document]:
|
| 29 |
for doc in docs:
|
|
@@ -31,13 +36,32 @@ def _clean_docs(docs: list[Document]) -> list[Document]:
|
|
| 31 |
doc.page_content = process_latex(doc.page_content)
|
| 32 |
return docs
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
def process_file(path: str, ext: str, session_id: str = "default_index") -> int:
|
| 36 |
loader = LOADERS.get(ext.lower())
|
| 37 |
if loader is None:
|
| 38 |
raise ValueError(f"Unsupported file type: .{ext}")
|
|
|
|
| 39 |
docs = loader(path)
|
| 40 |
docs = _clean_docs(docs)
|
|
|
|
| 41 |
chunks = chunk_docs(docs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
add_documents(chunks, session_id=session_id)
|
| 43 |
return len(chunks)
|
|
|
|
| 9 |
read_pptx,
|
| 10 |
read_txt,
|
| 11 |
read_xlsx,
|
| 12 |
+
read_tex,
|
| 13 |
)
|
| 14 |
from app.rag.vectorstore import add_documents
|
| 15 |
from langchain_core.documents import Document
|
| 16 |
+
from app.config import CHAT_MODEL, GOOGLE_API_KEY, SUMMARY_PROMPT
|
| 17 |
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 18 |
|
| 19 |
LOADERS = {
|
| 20 |
"pdf": read_pdf,
|
|
|
|
| 25 |
"docx": read_docx,
|
| 26 |
"xlsx": read_xlsx,
|
| 27 |
"pptx": read_pptx,
|
| 28 |
+
"tex": read_tex,
|
| 29 |
}
|
| 30 |
|
| 31 |
+
llm = ChatGoogleGenerativeAI(model=CHAT_MODEL, google_api_key=GOOGLE_API_KEY)
|
| 32 |
|
| 33 |
def _clean_docs(docs: list[Document]) -> list[Document]:
|
| 34 |
for doc in docs:
|
|
|
|
| 36 |
doc.page_content = process_latex(doc.page_content)
|
| 37 |
return docs
|
| 38 |
|
| 39 |
+
def _generate_summary(docs: list[Document]) -> str:
|
| 40 |
+
full_text = "\n\n".join(doc.page_content for doc in docs[:10])
|
| 41 |
+
try:
|
| 42 |
+
response = llm.invoke(SUMMARY_PROMPT.format(content=full_text))
|
| 43 |
+
return str(response.content)
|
| 44 |
+
except Exception as e:
|
| 45 |
+
print(f"Error generating summary: {e}")
|
| 46 |
+
return ""
|
| 47 |
|
| 48 |
def process_file(path: str, ext: str, session_id: str = "default_index") -> int:
|
| 49 |
loader = LOADERS.get(ext.lower())
|
| 50 |
if loader is None:
|
| 51 |
raise ValueError(f"Unsupported file type: .{ext}")
|
| 52 |
+
|
| 53 |
docs = loader(path)
|
| 54 |
docs = _clean_docs(docs)
|
| 55 |
+
# summary_text = _generate_summary(docs)
|
| 56 |
chunks = chunk_docs(docs)
|
| 57 |
+
|
| 58 |
+
# if summary_text:
|
| 59 |
+
# src = docs[0].metadata.get("source", "unknown")
|
| 60 |
+
# summary_doc = Document(
|
| 61 |
+
# page_content=f"DOCUMENT SUMMARY of {src}: {summary_text}",
|
| 62 |
+
# metadata={"source": src, "is_summary": True}
|
| 63 |
+
# )
|
| 64 |
+
# chunks.insert(0, summary_doc)
|
| 65 |
+
|
| 66 |
add_documents(chunks, session_id=session_id)
|
| 67 |
return len(chunks)
|
app/routes/ask.py
CHANGED
|
@@ -24,7 +24,12 @@ class AskResponse(BaseModel):
|
|
| 24 |
async def ask(body: AskRequest, user_id: str = Depends(get_user_id)) -> AskResponse:
|
| 25 |
prefixed_session_id = f"{user_id}_{body.session_id}"
|
| 26 |
store = get_vectorstore(prefixed_session_id)
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
if not docs:
|
| 30 |
raise HTTPException(400, "No documents found for this session.")
|
|
|
|
| 24 |
async def ask(body: AskRequest, user_id: str = Depends(get_user_id)) -> AskResponse:
|
| 25 |
prefixed_session_id = f"{user_id}_{body.session_id}"
|
| 26 |
store = get_vectorstore(prefixed_session_id)
|
| 27 |
+
|
| 28 |
+
is_summary_request = any(word in body.question.lower() for word in ["summarize", "summary", "overview", "tl;dr"])
|
| 29 |
+
|
| 30 |
+
k = TOP_K * 2 if is_summary_request else TOP_K
|
| 31 |
+
|
| 32 |
+
docs = store.similarity_search(body.question, k=k)
|
| 33 |
|
| 34 |
if not docs:
|
| 35 |
raise HTTPException(400, "No documents found for this session.")
|