Spaces:
Running
Running
github-actions[bot] committed on
Commit ·
afe8edb
1
Parent(s): cb5ebf4
Deploy from GitHub Actions: 98954c768f33c431a29875735f8c8c7497db8fbb
Browse files
- app/config.py +16 -15
- app/rag/pipeline.py +0 -21
- app/routes/upload.py +1 -9
app/config.py
CHANGED
|
@@ -41,21 +41,22 @@ ALLOWED_TYPES = {
|
|
| 41 |
"pptx",
|
| 42 |
}
|
| 43 |
|
| 44 |
-
PROMPT = (
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
|
|
|
| 59 |
|
| 60 |
CREATORS = [
|
| 61 |
{"Krishnendu Das" : "https://itskdhere.com"},
|
|
|
|
| 41 |
"pptx",
|
| 42 |
}
|
| 43 |
|
| 44 |
+
PROMPT = """You are ARC (Augmented Retrieval Chatbot), an intelligent and precise document assistant.
|
| 45 |
+
Your primary goal is to provide accurate, helpful, and well-structured answers based strictly on the provided context.
|
| 46 |
+
|
| 47 |
+
Follow these guidelines:
|
| 48 |
+
1. Base your answers ONLY on the provided context.
|
| 49 |
+
2. Provide a concise yet comprehensive summary of the document content if the user asks for an overview. Focus on the main topics, key points, and overall purpose of the document to help the user understand it at a high level.
|
| 50 |
+
3. If the context contains math or LaTeX, strictly preserve them using $ for inline math and $$ for display math.
|
| 51 |
+
4. If the answer cannot be found in the context, state so honestly. Do not hallucinate or make up information, but be as helpful as possible with the provided information.
|
| 52 |
+
5. Use clear markdown formatting (bullet points, bold text, headings) to structure your response.
|
| 53 |
+
|
| 54 |
+
Context:
|
| 55 |
+
{context}
|
| 56 |
+
|
| 57 |
+
Question:
|
| 58 |
+
{question}
|
| 59 |
+
"""
|
| 60 |
|
| 61 |
CREATORS = [
|
| 62 |
{"Krishnendu Das" : "https://itskdhere.com"},
|
app/rag/pipeline.py
CHANGED
|
@@ -13,8 +13,6 @@ from app.rag.loader import (
|
|
| 13 |
)
|
| 14 |
from app.rag.vectorstore import add_documents
|
| 15 |
from langchain_core.documents import Document
|
| 16 |
-
from app.config import CHAT_MODEL, GOOGLE_API_KEY, SUMMARY_PROMPT
|
| 17 |
-
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 18 |
|
| 19 |
LOADERS = {
|
| 20 |
"pdf": read_pdf,
|
|
@@ -28,7 +26,6 @@ LOADERS = {
|
|
| 28 |
"pptx": read_pptx,
|
| 29 |
}
|
| 30 |
|
| 31 |
-
llm = ChatGoogleGenerativeAI(model=CHAT_MODEL, google_api_key=GOOGLE_API_KEY)
|
| 32 |
|
| 33 |
def _clean_docs(docs: list[Document]) -> list[Document]:
|
| 34 |
for doc in docs:
|
|
@@ -36,14 +33,6 @@ def _clean_docs(docs: list[Document]) -> list[Document]:
|
|
| 36 |
doc.page_content = process_latex(doc.page_content)
|
| 37 |
return docs
|
| 38 |
|
| 39 |
-
def _generate_summary(docs: list[Document]) -> str:
|
| 40 |
-
full_text = "\n\n".join(doc.page_content for doc in docs[:10])
|
| 41 |
-
try:
|
| 42 |
-
response = llm.invoke(SUMMARY_PROMPT.format(content=full_text))
|
| 43 |
-
return str(response.content)
|
| 44 |
-
except Exception as e:
|
| 45 |
-
print(f"Error generating summary: {e}")
|
| 46 |
-
return ""
|
| 47 |
|
| 48 |
def process_file(path: str, ext: str, session_id: str = "default_index") -> int:
|
| 49 |
loader = LOADERS.get(ext.lower())
|
|
@@ -52,16 +41,6 @@ def process_file(path: str, ext: str, session_id: str = "default_index") -> int:
|
|
| 52 |
|
| 53 |
docs = loader(path)
|
| 54 |
docs = _clean_docs(docs)
|
| 55 |
-
# summary_text = _generate_summary(docs)
|
| 56 |
chunks = chunk_docs(docs)
|
| 57 |
-
|
| 58 |
-
# if summary_text:
|
| 59 |
-
# src = docs[0].metadata.get("source", "unknown")
|
| 60 |
-
# summary_doc = Document(
|
| 61 |
-
# page_content=f"DOCUMENT SUMMARY of {src}: {summary_text}",
|
| 62 |
-
# metadata={"source": src, "is_summary": True}
|
| 63 |
-
# )
|
| 64 |
-
# chunks.insert(0, summary_doc)
|
| 65 |
-
|
| 66 |
add_documents(chunks, session_id=session_id)
|
| 67 |
return len(chunks)
|
|
|
|
| 13 |
)
|
| 14 |
from app.rag.vectorstore import add_documents
|
| 15 |
from langchain_core.documents import Document
|
|
|
|
|
|
|
| 16 |
|
| 17 |
LOADERS = {
|
| 18 |
"pdf": read_pdf,
|
|
|
|
| 26 |
"pptx": read_pptx,
|
| 27 |
}
|
| 28 |
|
|
|
|
| 29 |
|
| 30 |
def _clean_docs(docs: list[Document]) -> list[Document]:
|
| 31 |
for doc in docs:
|
|
|
|
| 33 |
doc.page_content = process_latex(doc.page_content)
|
| 34 |
return docs
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
def process_file(path: str, ext: str, session_id: str = "default_index") -> int:
|
| 38 |
loader = LOADERS.get(ext.lower())
|
|
|
|
| 41 |
|
| 42 |
docs = loader(path)
|
| 43 |
docs = _clean_docs(docs)
|
|
|
|
| 44 |
chunks = chunk_docs(docs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
add_documents(chunks, session_id=session_id)
|
| 46 |
return len(chunks)
|
app/routes/upload.py
CHANGED
|
@@ -8,11 +8,7 @@ router = APIRouter()
|
|
| 8 |
|
| 9 |
|
| 10 |
@router.post("/upload")
|
| 11 |
-
async def upload_files(
|
| 12 |
-
files: list[UploadFile] = File(...),
|
| 13 |
-
session_id: str = Form(...),
|
| 14 |
-
user_id: str = Depends(get_user_id)
|
| 15 |
-
) -> dict:
|
| 16 |
prefixed_session_id = f"{user_id}_{session_id}"
|
| 17 |
results = []
|
| 18 |
errors = []
|
|
@@ -26,10 +22,6 @@ async def upload_files(
|
|
| 26 |
for file in files:
|
| 27 |
original_name = file.filename or f"upload.bin"
|
| 28 |
safe_name = os.path.basename(original_name)
|
| 29 |
-
if not safe_name:
|
| 30 |
-
errors.append({"source": original_name, "error": "Invalid filename"})
|
| 31 |
-
continue
|
| 32 |
-
|
| 33 |
ext = safe_name.rsplit(".", 1)[-1].lower()
|
| 34 |
if ext not in ALLOWED_TYPES:
|
| 35 |
errors.append({"source": original_name, "error": f"Unsupported file type: .{ext}"})
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
@router.post("/upload")
|
| 11 |
+
async def upload_files(files: list[UploadFile] = File(...), session_id: str = Form(...), user_id: str = Depends(get_user_id)) -> dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
prefixed_session_id = f"{user_id}_{session_id}"
|
| 13 |
results = []
|
| 14 |
errors = []
|
|
|
|
| 22 |
for file in files:
|
| 23 |
original_name = file.filename or f"upload.bin"
|
| 24 |
safe_name = os.path.basename(original_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
ext = safe_name.rsplit(".", 1)[-1].lower()
|
| 26 |
if ext not in ALLOWED_TYPES:
|
| 27 |
errors.append({"source": original_name, "error": f"Unsupported file type: .{ext}"})
|