github-actions[bot] committed on
Commit
c1b316f
·
1 Parent(s): 177b4ac

Deploy from GitHub Actions: 37b5d8a4f4600eb83d09a6e8bb178d0f7e6bc890

Browse files
Files changed (4) hide show
  1. app/config.py +11 -2
  2. app/rag/loader.py +7 -0
  3. app/rag/pipeline.py +24 -0
  4. app/routes/ask.py +6 -1
app/config.py CHANGED
@@ -38,16 +38,25 @@ ALLOWED_TYPES = {
38
  "txt",
39
  "md",
40
  "json",
 
41
  }
42
 
43
  PROMPT = (
44
  "You are ARC, a helpful document assistant. "
45
- "Answer the question based ONLY on the provided context. "
 
46
  "If the context contains math or LaTeX, preserve them using $ for inline and $$ for display math. "
47
- "If you cannot answer from the context, say so honestly. "
48
  "Context: {context} Question: {question}"
49
  )
50
 
 
 
 
 
 
 
 
51
  CREATORS = [
52
  {"Krishnendu Das" : "https://itskdhere.com"},
53
  {"Saptarshi Roy" : "https://hirishi.in"}
 
38
  "txt",
39
  "md",
40
  "json",
41
+ "tex",
42
  }
43
 
44
  PROMPT = (
45
  "You are ARC, a helpful document assistant. "
46
+ "Your goal is to provide accurate and helpful answers based on the context provided. "
47
+ "If the user asks for a summary, synthesize the context into a clear, structured overview. "
48
  "If the context contains math or LaTeX, preserve them using $ for inline and $$ for display math. "
49
+ "If you cannot find the answer in the context, say so honestly, but try to be as helpful as possible with the information you have. "
50
  "Context: {context} Question: {question}"
51
  )
52
 
53
+ SUMMARY_PROMPT = (
54
+ "Provide a concise yet comprehensive summary of the following document content. "
55
+ "Focus on the main topics, key points, and overall purpose of the document. "
56
+ "This summary will be used to help a chatbot understand the document at a high level. "
57
+ "Content: {content}"
58
+ )
59
+
60
  CREATORS = [
61
  {"Krishnendu Das" : "https://itskdhere.com"},
62
  {"Saptarshi Roy" : "https://hirishi.in"}
app/rag/loader.py CHANGED
@@ -74,3 +74,10 @@ def read_pptx(path: str) -> list[Document]:
74
  docs = loader.load()
75
  return docs
76
 
 
 
 
 
 
 
 
 
74
  docs = loader.load()
75
  return docs
76
 
77
+
78
+ # TEX
79
+ # https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.text.TextLoader
80
+ def read_tex(path: str) -> list[Document]:
81
+ loader = TextLoader(path, encoding="utf-8")
82
+ docs = loader.load()
83
+ return docs
app/rag/pipeline.py CHANGED
@@ -9,9 +9,12 @@ from app.rag.loader import (
9
  read_pptx,
10
  read_txt,
11
  read_xlsx,
 
12
  )
13
  from app.rag.vectorstore import add_documents
14
  from langchain_core.documents import Document
 
 
15
 
16
  LOADERS = {
17
  "pdf": read_pdf,
@@ -22,8 +25,10 @@ LOADERS = {
22
  "docx": read_docx,
23
  "xlsx": read_xlsx,
24
  "pptx": read_pptx,
 
25
  }
26
 
 
27
 
28
  def _clean_docs(docs: list[Document]) -> list[Document]:
29
  for doc in docs:
@@ -31,13 +36,32 @@ def _clean_docs(docs: list[Document]) -> list[Document]:
31
  doc.page_content = process_latex(doc.page_content)
32
  return docs
33
 
 
 
 
 
 
 
 
 
34
 
35
  def process_file(path: str, ext: str, session_id: str = "default_index") -> int:
36
  loader = LOADERS.get(ext.lower())
37
  if loader is None:
38
  raise ValueError(f"Unsupported file type: .{ext}")
 
39
  docs = loader(path)
40
  docs = _clean_docs(docs)
 
41
  chunks = chunk_docs(docs)
 
 
 
 
 
 
 
 
 
42
  add_documents(chunks, session_id=session_id)
43
  return len(chunks)
 
9
  read_pptx,
10
  read_txt,
11
  read_xlsx,
12
+ read_tex,
13
  )
14
  from app.rag.vectorstore import add_documents
15
  from langchain_core.documents import Document
16
+ from app.config import CHAT_MODEL, GOOGLE_API_KEY, SUMMARY_PROMPT
17
+ from langchain_google_genai import ChatGoogleGenerativeAI
18
 
19
  LOADERS = {
20
  "pdf": read_pdf,
 
25
  "docx": read_docx,
26
  "xlsx": read_xlsx,
27
  "pptx": read_pptx,
28
+ "tex": read_tex,
29
  }
30
 
31
+ llm = ChatGoogleGenerativeAI(model=CHAT_MODEL, google_api_key=GOOGLE_API_KEY)
32
 
33
  def _clean_docs(docs: list[Document]) -> list[Document]:
34
  for doc in docs:
 
36
  doc.page_content = process_latex(doc.page_content)
37
  return docs
38
 
39
+ def _generate_summary(docs: list[Document]) -> str:
40
+ full_text = "\n\n".join(doc.page_content for doc in docs[:10])
41
+ try:
42
+ response = llm.invoke(SUMMARY_PROMPT.format(content=full_text))
43
+ return str(response.content)
44
+ except Exception as e:
45
+ print(f"Error generating summary: {e}")
46
+ return ""
47
 
48
  def process_file(path: str, ext: str, session_id: str = "default_index") -> int:
49
  loader = LOADERS.get(ext.lower())
50
  if loader is None:
51
  raise ValueError(f"Unsupported file type: .{ext}")
52
+
53
  docs = loader(path)
54
  docs = _clean_docs(docs)
55
+ # summary_text = _generate_summary(docs)
56
  chunks = chunk_docs(docs)
57
+
58
+ # if summary_text:
59
+ # src = docs[0].metadata.get("source", "unknown")
60
+ # summary_doc = Document(
61
+ # page_content=f"DOCUMENT SUMMARY of {src}: {summary_text}",
62
+ # metadata={"source": src, "is_summary": True}
63
+ # )
64
+ # chunks.insert(0, summary_doc)
65
+
66
  add_documents(chunks, session_id=session_id)
67
  return len(chunks)
app/routes/ask.py CHANGED
@@ -24,7 +24,12 @@ class AskResponse(BaseModel):
24
  async def ask(body: AskRequest, user_id: str = Depends(get_user_id)) -> AskResponse:
25
  prefixed_session_id = f"{user_id}_{body.session_id}"
26
  store = get_vectorstore(prefixed_session_id)
27
- docs = store.similarity_search(body.question, k=TOP_K)
 
 
 
 
 
28
 
29
  if not docs:
30
  raise HTTPException(400, "No documents found for this session.")
 
24
  async def ask(body: AskRequest, user_id: str = Depends(get_user_id)) -> AskResponse:
25
  prefixed_session_id = f"{user_id}_{body.session_id}"
26
  store = get_vectorstore(prefixed_session_id)
27
+
28
+ is_summary_request = any(word in body.question.lower() for word in ["summarize", "summary", "overview", "tl;dr"])
29
+
30
+ k = TOP_K * 2 if is_summary_request else TOP_K
31
+
32
+ docs = store.similarity_search(body.question, k=k)
33
 
34
  if not docs:
35
  raise HTTPException(400, "No documents found for this session.")