github-actions[bot] committed on
Commit
f6a4c20
·
1 Parent(s): b623276

Deploy from GitHub Actions: df12f0dffbbb25b28e353981e621b4fe6afd80f0

Browse files
.dockerignore ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
161
+
162
+ ### Python Patch ###
163
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
164
+ poetry.toml
165
+
166
+ # ruff
167
+ .ruff_cache/
168
+
169
+ # LSP config files
170
+ pyrightconfig.json
.env.example ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # API
2
+ ENV=development # `production` or `development`
3
+ CORS_ORIGINS=["*"]
4
+
5
+ # Gemini
6
+ GOOGLE_API_KEY=your_gemini_api_key
7
+
8
+ # Pinecone
9
+ PINECONE_API_KEY=your_pinecone_api_key
10
+ PINECONE_INDEX_NAME=your_pinecone_index_name
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ .env
2
+ data/
3
+ data
4
+ __pycache__/
5
+ __pycache__
6
+ *venv/
7
+ *venv
8
+
9
+ docs/
Dockerfile ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.12-slim

# System libraries needed at runtime (CA bundle, libmagic).
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    libmagic1 \
    && rm -rf /var/lib/apt/lists/*

COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/

ENV ENV=production \
    PORT=7860 \
    PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1

# Install dependencies BEFORE dropping privileges: `uv pip install --system`
# writes into the interpreter's global site-packages, which the unprivileged
# user created below cannot modify (uv has no per-user fallback like pip).
COPY requirements.txt /tmp/requirements.txt
RUN uv pip install --no-cache --system -r /tmp/requirements.txt

RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

COPY --chown=user requirements.txt .
COPY --chown=user app app

EXPOSE ${PORT}

# Shell form so ${PORT} is expanded at container start.
CMD ["sh", "-c", "uvicorn app.main:app --host 0.0.0.0 --port ${PORT}"]
README.md CHANGED
@@ -1,11 +1,42 @@
1
  ---
2
  title: ARC
3
- emoji: 🌍
4
- colorFrom: red
5
- colorTo: blue
6
  sdk: docker
 
7
  pinned: false
8
  short_description: Augmented Retrieval Chatbot
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: ARC
3
+ emoji:
4
+ colorFrom: green
5
+ colorTo: yellow
6
  sdk: docker
7
+ app_port: 7860
8
  pinned: false
9
  short_description: Augmented Retrieval Chatbot
10
  ---
11
 
12
+ ## Backend
13
+
14
+ ### Run Server
15
+
16
+ ```bash
17
+ cd backend
18
+ ```
19
+
20
+ ```bash
21
+ uv venv .venv
22
+ ```
23
+
24
+ ```powershell
25
+ .venv\Scripts\activate
26
+ ```
27
+
28
+ ```bash
29
+ uv pip install -r requirements.txt
30
+ ```
31
+
32
+ ```bash
33
+ uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
34
+ ```
35
+
36
+ ---
37
+
38
+ ### Clean Cache
39
+
40
+ ```powershell
41
+ Get-ChildItem -Path . -Include **pycache** -Recurse -Force | Remove-Item -Recurse -Force
42
+ ```
app/config.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+
7
+
8
def _parse_cors_origins(raw: str) -> list[str]:
    """Turn the CORS_ORIGINS setting into a list of origin strings.

    Accepts a JSON array (``["https://a", "https://b"]``), a JSON string
    (``"https://a"``), or a plain comma-separated value (``https://a,https://b``
    or ``*``). Blank entries are dropped.

    Raises:
        ValueError: if the value is valid JSON but neither a list nor a string.
    """
    raw = raw.strip()
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        # Not JSON: treat it as a comma-separated list instead of crashing at import.
        parsed = raw.split(",")
    if isinstance(parsed, str):
        parsed = [parsed]
    if not isinstance(parsed, list):
        raise ValueError(
            f"CORS_ORIGINS must be a JSON list or a comma-separated string, got: {raw!r}"
        )
    return [str(origin).strip() for origin in parsed if str(origin).strip()]


# Runtime environment and CORS policy (read from the process environment / .env).
ENV = os.getenv("ENV", "development")
CORS_ORIGINS_STR = os.getenv("CORS_ORIGINS", '["*"]')
CORS_ORIGINS = _parse_cors_origins(CORS_ORIGINS_STR)

APP_NAME = "ARC API"
APP_VERSION = "2.0.0"

# Credentials for the external services; None when the variable is unset.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")

EMBED_MODEL = "models/gemini-embedding-001"
CHAT_MODEL = "gemini-2.5-flash-lite"

# Retrieval and chunking parameters.
TOP_K = 10
CHUNK_SIZE = 1500
CHUNK_OVERLAP = 200
UPLOAD_BATCH_SIZE = 100

# Upload limits: number of files per request and size per file (5 MiB).
MAX_FILE_COUNT = 6
MAX_FILE_SIZE = 5 * 1024 * 1024

UPLOAD_DIR = "data/uploads"

# File extensions (lower-case, without the dot) accepted by the upload route.
ALLOWED_TYPES = {
    "pdf",
    "docx",
    "xlsx",
    "csv",
    "pptx",
    "txt",
    "md",
    "json",
}

# Prompt template; filled with str.format(context=..., question=...).
PROMPT = (
    "You are ARC, a helpful document assistant. "
    "Answer the question based ONLY on the provided context. "
    "If the context contains math or LaTeX, preserve them using $ for inline and $$ for display math. "
    "If you cannot answer from the context, say so honestly. "
    "Context: {context} Question: {question}"
)

CREATORS = [
    {"name": "Krishnendu Das", "url": "https://itskdhere.com"},
    {"name": "Saptarshi Roy", "url": "https://hirishi.in"},
]
app/main.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.routes.ask import router as ask_router
2
+ from app.routes.delete import router as delete_router
3
+ from app.routes.clear import router as clear_router
4
+ from app.routes.chats import router as chats_router
5
+ from app.routes.upload import router as upload_router
6
+ from app.config import APP_NAME, APP_VERSION, CORS_ORIGINS, ENV, CREATORS
7
+ from fastapi import FastAPI
8
+ from fastapi.middleware.cors import CORSMiddleware
9
+ from fastapi.staticfiles import StaticFiles
10
+ from fastapi.responses import FileResponse
11
+ import os
12
+
13
# Interactive documentation endpoints are only served outside production.
_docs_enabled = ENV != "production"

app = FastAPI(
    title=APP_NAME,
    version=APP_VERSION,
    description="Augmented Retrieval Chatbot - API",
    docs_url="/docs" if _docs_enabled else None,
    redoc_url="/redoc" if _docs_enabled else None,
    openapi_url="/openapi.json" if _docs_enabled else None,
)

# NOTE(review): credentials are enabled regardless of what CORS_ORIGINS holds;
# confirm this is intended when the origin list is a wildcard.
app.add_middleware(
    CORSMiddleware,
    allow_origins=CORS_ORIGINS,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Register the route modules in a fixed order.
for _router in (ask_router, upload_router, delete_router, clear_router, chats_router):
    app.include_router(_router)
35
+
36
+
37
@app.get("/")
async def root():
    """Return basic service metadata: name, version, status and creators."""
    payload = {"name": APP_NAME, "version": APP_VERSION}
    payload["status"] = "OK"
    payload["creators"] = CREATORS
    return payload
45
+
46
+
47
# Serve everything under app/static at /static.
app.mount("/static", StaticFiles(directory="app/static"), name="static")


@app.get('/favicon.ico', include_in_schema=False)
async def favicon():
    """Serve the favicon from the static directory (hidden from the API schema)."""
    icon_path = os.path.join("app", "static", "favicon.ico")
    return FileResponse(icon_path)
app/rag/chunker.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.config import CHUNK_OVERLAP, CHUNK_SIZE
2
+ from langchain_core.documents import Document
3
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
4
+
5
# One shared splitter configured from the application settings.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)


def chunk_docs(docs: list[Document]) -> list[Document]:
    """Split the given documents into chunks using the shared text splitter."""
    chunks = text_splitter.split_documents(docs)
    return chunks
12
+
13
+
14
+ # https://docs.langchain.com/oss/python/integrations/splitters
app/rag/cleaner.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+
4
def clean_text(text: str) -> str:
    """Normalise whitespace in extracted document text.

    Removes NUL characters, converts CR/CRLF line endings to LF, collapses runs
    of spaces/tabs to a single space, strips trailing spaces from lines, limits
    consecutive blank lines to one, and trims the result.

    Args:
        text: Raw text; ``None`` or an empty string yields ``""``.

    Returns:
        The cleaned text.
    """
    if not text:
        return ""
    text = text.replace("\x00", "")
    # Unify line endings so the blank-line collapse below sees plain "\n".
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = re.sub(r"[ \t]+", " ", text)
    # Drop trailing spaces; otherwise "blank" lines that contain whitespace
    # keep the newlines apart and the collapse below never matches them.
    text = re.sub(r" \n", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
11
+
12
+
13
def process_latex(text: str) -> str:
    """Rewrite LaTeX math delimiters into ``$`` / ``$$`` form.

    Steps, in order:
      1. ``\\[ ... \\]`` becomes ``$$ ... $$`` and ``\\( ... \\)`` becomes ``$ ... $``.
      2. Display environments (equation, align, ...) are replaced by their
         body wrapped in ``$$``.
      3. Matrix-like environments (pmatrix, cases, ...) that are not already
         inside math are wrapped in ``$$``, keeping the environment text
         exactly as written (including a trailing ``*``).
      4. Runs of three or more ``$`` are collapsed to ``$$``.

    Args:
        text: Text to process; falsy values are returned unchanged.

    Returns:
        The text with normalised math delimiters.
    """
    if not text:
        return text

    text = re.sub(r"\\\[(.*?)\\\]", r"$$\1$$", text, flags=re.DOTALL)
    text = re.sub(r"\\\((.*?)\\\)", r"$\1$", text, flags=re.DOTALL)

    display = r"equation|align|gather|displaymath|eqnarray|multline|flalign|split"
    text = re.sub(
        rf"\\begin{{({display})\*?}}(.*?)\\end{{\1\*?}}",
        r"$$\2$$",
        text,
        flags=re.DOTALL,
    )

    def _wrap_matrix(match: re.Match) -> str:
        """Wrap a matrix environment in ``$$`` unless it already sits in math."""
        source = match.string
        start = match.start()
        # An odd number of "$$" before the match means a display block is
        # still open, so the environment is already inside math.
        if source.count("$$", 0, start) % 2:
            return match.group(0)
        # Directly preceded by a "$" delimiter (ignoring whitespace): leave it.
        if source[:start].rstrip().endswith("$"):
            return match.group(0)
        return f"$${match.group(0)}$$"

    matrix = r"matrix|pmatrix|bmatrix|vmatrix|Bmatrix|cases|array"
    text = re.sub(
        rf"\\begin{{({matrix})\*?}}(.*?)\\end{{\1\*?}}",
        _wrap_matrix,
        text,
        flags=re.DOTALL,
    )

    return re.sub(r"\${3,}", "$$", text)
37
+
app/rag/embedder.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.config import EMBED_MODEL, GOOGLE_API_KEY
2
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
3
+
4
# Shared embedding client used by the vector store.
# NOTE(review): output_dimensionality presumably has to match the dimension of
# the Pinecone index - confirm against the index configuration.
embeddings = GoogleGenerativeAIEmbeddings(
    google_api_key=GOOGLE_API_KEY,
    model=EMBED_MODEL,
    output_dimensionality=768,
)
9
+
10
+
11
+ # https://python.langchain.com/docs/integrations/text_embedding/google_generative_ai/
app/rag/loader.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import (
2
+ CSVLoader,
3
+ Docx2txtLoader,
4
+ JSONLoader,
5
+ PDFPlumberLoader,
6
+ TextLoader,
7
+ UnstructuredExcelLoader,
8
+ UnstructuredMarkdownLoader,
9
+ UnstructuredPowerPointLoader,
10
+ )
11
+ from langchain_core.documents import Document
12
+
13
+
14
# PDF
# https://python.langchain.com/docs/integrations/document_loaders/pdfplumber
def read_pdf(path: str) -> list[Document]:
    """Load the PDF at *path* with PDFPlumberLoader and return its documents."""
    return PDFPlumberLoader(path).load()
20
+
21
+
22
# TXT
# https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.text.TextLoader
def read_txt(path: str) -> list[Document]:
    """Load the text file at *path*, decoded as UTF-8, via TextLoader."""
    return TextLoader(path, encoding="utf-8").load()
28
+
29
+
30
# CSV
# https://python.langchain.com/docs/integrations/document_loaders/csv
def read_csv(path: str) -> list[Document]:
    """Load the CSV file at *path* with CSVLoader and return its documents."""
    return CSVLoader(file_path=path).load()
36
+
37
+
38
# MD
# https://python.langchain.com/docs/integrations/document_loaders/unstructured_file/
def read_md(path: str) -> list[Document]:
    """Load the Markdown file at *path* with UnstructuredMarkdownLoader."""
    return UnstructuredMarkdownLoader(path).load()
44
+
45
+
46
# JSON
# https://python.langchain.com/docs/integrations/document_loaders/json
def read_json(path: str) -> list[Document]:
    """Load the JSON file at *path* with JSONLoader, selecting the root (".")."""
    json_loader = JSONLoader(
        file_path=path,
        jq_schema=".",
        text_content=False,
    )
    return json_loader.load()
52
+
53
+
54
# DOCX
# https://python.langchain.com/docs/integrations/document_loaders/microsoft_word
def read_docx(path: str) -> list[Document]:
    """Load the Word document at *path* with Docx2txtLoader."""
    return Docx2txtLoader(path).load()
60
+
61
+
62
# XLSX
# https://python.langchain.com/docs/integrations/document_loaders/microsoft_excel
def read_xlsx(path: str) -> list[Document]:
    """Load the Excel workbook at *path* with UnstructuredExcelLoader ("elements" mode)."""
    return UnstructuredExcelLoader(path, mode="elements").load()
68
+
69
+
70
# PPTX
# https://python.langchain.com/docs/integrations/document_loaders/microsoft_powerpoint
def read_pptx(path: str) -> list[Document]:
    """Load the PowerPoint file at *path* with UnstructuredPowerPointLoader ("elements" mode)."""
    return UnstructuredPowerPointLoader(path, mode="elements").load()
76
+
app/rag/pipeline.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.rag.chunker import chunk_docs
2
+ from app.rag.cleaner import clean_text, process_latex
3
+ from app.rag.loader import (
4
+ read_csv,
5
+ read_docx,
6
+ read_json,
7
+ read_md,
8
+ read_pdf,
9
+ read_pptx,
10
+ read_txt,
11
+ read_xlsx,
12
+ )
13
+ from app.rag.vectorstore import add_documents
14
+ from langchain_core.documents import Document
15
+
16
# Dispatch table: lower-case file extension (no dot) -> loader function.
LOADERS = dict(
    pdf=read_pdf,
    txt=read_txt,
    csv=read_csv,
    md=read_md,
    json=read_json,
    docx=read_docx,
    xlsx=read_xlsx,
    pptx=read_pptx,
)
26
+
27
+
28
def _clean_docs(docs: list[Document]) -> list[Document]:
    """Clean each document's text in place and return the same list.

    Every page_content is passed through clean_text and then process_latex.
    """
    for document in docs:
        document.page_content = process_latex(clean_text(document.page_content))
    return docs
33
+
34
+
35
def process_file(path: str, ext: str, session_id: str = "default_index") -> int:
    """Load, clean, chunk and index one file; return the number of chunks stored.

    Args:
        path: Location of the file on disk.
        ext: File extension without the dot; matched case-insensitively.
        session_id: Namespace the chunks are added to.

    Raises:
        ValueError: if no loader is registered for *ext*.
    """
    load = LOADERS.get(ext.lower())
    if load is None:
        raise ValueError(f"Unsupported file type: .{ext}")

    chunks = chunk_docs(_clean_docs(load(path)))
    add_documents(chunks, session_id=session_id)
    return len(chunks)
app/rag/vectorstore.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import threading
2
+ import time
3
+
4
+ from app.config import PINECONE_API_KEY, PINECONE_INDEX_NAME, UPLOAD_BATCH_SIZE
5
+ from app.rag.embedder import embeddings
6
+ from langchain_core.documents import Document
7
+ from langchain_pinecone import PineconeVectorStore
8
+ from pinecone import Pinecone
9
+
10
# Lazily created index handle shared by the whole process, plus its guard.
_pinecone_index = None
_pinecone_lock = threading.Lock()


def _get_index():
    """Return the shared Pinecone index handle, creating it on first use."""
    global _pinecone_index
    if _pinecone_index is not None:
        return _pinecone_index
    with _pinecone_lock:
        # Check again under the lock: another thread may have created the
        # handle while this one was waiting.
        if _pinecone_index is None:
            client = Pinecone(api_key=PINECONE_API_KEY)
            _pinecone_index = client.Index(PINECONE_INDEX_NAME)
    return _pinecone_index
22
+
23
+
24
def get_vectorstore(session_id: str = "default_index") -> PineconeVectorStore:
    """Build a PineconeVectorStore bound to the namespace *session_id*."""
    store = PineconeVectorStore(
        namespace=session_id,
        index_name=PINECONE_INDEX_NAME,
        pinecone_api_key=PINECONE_API_KEY,
        embedding=embeddings,
    )
    return store
31
+
32
+
33
def add_documents(chunks: list[Document], session_id: str = "default_index") -> None:
    """Add *chunks* to the session's vector store in batches.

    Batches hold at most UPLOAD_BATCH_SIZE chunks, with a 0.5 s pause between
    consecutive batches.

    Raises:
        ValueError: if *chunks* is empty.
    """
    if not chunks:
        raise ValueError("No text could be extracted from the file.")

    store = get_vectorstore(session_id)
    total = len(chunks)
    start = 0
    while start < total:
        stop = start + UPLOAD_BATCH_SIZE
        store.add_documents(chunks[start:stop])
        # Pause only when another batch follows.
        if stop < total:
            time.sleep(0.5)
        start = stop
43
+
44
+
45
def delete_vectorstore(session_id: str) -> bool:
    """Delete every vector in the namespace *session_id*.

    Returns:
        True on success; False if any exception was raised (it is printed).
    """
    try:
        _get_index().delete(delete_all=True, namespace=session_id)
    except Exception as e:
        print(f"delete_vectorstore: failed to delete namespace '{session_id}': {e}")
        return False
    return True
53
+
54
+
55
def delete_all_vectorstores() -> bool:
    """Delete the vectors of every namespace reported by the index.

    Each namespace is attempted even if an earlier one fails.

    Returns:
        True only if all namespaces were deleted; False if any deletion
        failed or the namespace listing itself raised (errors are printed).
    """
    try:
        index = _get_index()
        namespaces = list(index.describe_index_stats().namespaces.keys())
        failed: list[str] = []
        for ns in namespaces:
            try:
                index.delete(delete_all=True, namespace=ns)
            except Exception as e:
                print(f"Failed to delete namespace '{ns}': {e}")
                failed.append(ns)
        if not failed:
            return True
        print(f"delete_all_vectorstores: {len(failed)}/{len(namespaces)} namespaces failed: {failed}")
        return False
    except Exception as e:
        print(f"delete_all_vectorstores: unexpected error: {e}")
        return False
app/routes/ask.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.config import CHAT_MODEL, GOOGLE_API_KEY, PROMPT, TOP_K
2
+ from app.rag.vectorstore import get_vectorstore
3
+ from fastapi import APIRouter, HTTPException
4
+ from langchain_core.messages import HumanMessage
5
+ from langchain_google_genai import ChatGoogleGenerativeAI
6
+ from pydantic import BaseModel
7
+
8
router = APIRouter()

# Chat model client, created once at import time and shared by all requests.
llm = ChatGoogleGenerativeAI(google_api_key=GOOGLE_API_KEY, model=CHAT_MODEL)
11
+
12
+
13
# Request body for POST /ask.
class AskRequest(BaseModel):
    # The user's question.
    question: str
    # Vector-store namespace to search; defaults to the shared namespace.
    session_id: str = "default_index"
16
+
17
+
18
# Response body for POST /ask.
class AskResponse(BaseModel):
    # The model's answer text.
    answer: str
20
+
21
+
22
@router.post("/ask")
async def ask(body: AskRequest) -> AskResponse:
    """Answer a question from the documents stored for a session.

    Retrieves the TOP_K most similar chunks from the session's namespace,
    joins them into a context and asks the chat model with PROMPT.

    Raises:
        HTTPException(400): if the session has no matching documents.
    """
    # Function-scope import keeps this block self-contained.
    import asyncio

    def _retrieve():
        return get_vectorstore(body.session_id).similarity_search(body.question, k=TOP_K)

    # The vector-store and LLM clients are synchronous; run them in a worker
    # thread so this coroutine does not stall the event loop for other requests.
    docs = await asyncio.to_thread(_retrieve)

    if not docs:
        raise HTTPException(400, "No documents found for this session.")

    context = "\n\n".join(d.page_content for d in docs)
    prompt = PROMPT.format(context=context, question=body.question)
    response = await asyncio.to_thread(llm.invoke, [HumanMessage(content=prompt)])

    return AskResponse(answer=str(response.content))
app/routes/chats.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from datetime import datetime
3
+ from app.rag.vectorstore import _get_index
4
+ from fastapi import APIRouter, HTTPException
5
+
6
router = APIRouter()


@router.get("/chats")
async def get_chats() -> dict:
    """List chat sessions derived from the index's namespaces.

    Only namespaces consisting solely of digits are treated as chats; the
    number is interpreted as a millisecond timestamp and used for the title
    and date. Namespaces whose number cannot be converted to a date are
    skipped instead of failing the whole request.

    Returns:
        ``{"chats": [...]}`` sorted newest first.

    Raises:
        HTTPException(500): if the namespace list cannot be fetched.
    """
    # Function-scope import keeps this block self-contained.
    import asyncio

    def _list_namespaces() -> list[str]:
        return list(_get_index().describe_index_stats().namespaces.keys())

    try:
        # Synchronous network call: run it off the event loop.
        namespaces = await asyncio.to_thread(_list_namespaces)
    except Exception as e:
        print(f"Error fetching chats: {e}")
        raise HTTPException(500, "Failed to fetch chats from Pinecone.") from e

    chats = []
    for ns in namespaces:
        if not re.fullmatch(r'\d+', ns):
            continue

        try:
            # NOTE(review): produces a naive datetime in the server's local
            # timezone - confirm that is what the client expects.
            dt = datetime.fromtimestamp(int(ns) / 1000.0)
        except (OverflowError, OSError, ValueError):
            # Numeric but not a representable timestamp: not a chat id.
            continue

        chats.append({
            "id": ns,
            "title": f"Analysis {dt.strftime('%H:%M:%S')}",
            "date": dt.strftime('%Y-%m-%d')
        })

    chats.sort(key=lambda x: int(x["id"]), reverse=True)
    return {"chats": chats}
app/routes/clear.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.rag.vectorstore import delete_all_vectorstores
2
+ from fastapi import APIRouter, HTTPException
3
+
4
router = APIRouter()


@router.delete("/clear")
async def clear_index() -> dict:
    """Delete the vectors of every namespace in the index.

    Raises:
        HTTPException(500): if delete_all_vectorstores reports a failure.
    """
    # Function-scope import keeps this block self-contained.
    import asyncio

    # The deletion issues synchronous requests; run it in a worker thread so
    # the event loop stays responsive.
    cleared = await asyncio.to_thread(delete_all_vectorstores)
    if not cleared:
        raise HTTPException(500, "Failed to clear the vector store.")
    return {"message": "All vector stores cleared."}
app/routes/delete.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.rag.vectorstore import delete_vectorstore
2
+ from fastapi import APIRouter, HTTPException
3
+
4
router = APIRouter()


@router.delete("/delete/{session_id}")
async def delete_specific_chat(session_id: str) -> dict:
    """Delete the vectors stored for one session.

    Raises:
        HTTPException(404): if delete_vectorstore reports a failure.
    """
    # Function-scope import keeps this block self-contained.
    import asyncio

    # Synchronous client call: run it off the event loop.
    deleted = await asyncio.to_thread(delete_vectorstore, session_id)
    if not deleted:
        raise HTTPException(404, f"No vector store found for session: {session_id}")
    return {"message": f"Vector store for session {session_id} deleted."}
app/routes/upload.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from app.config import ALLOWED_TYPES, MAX_FILE_COUNT, MAX_FILE_SIZE, UPLOAD_DIR
3
+ from app.rag.pipeline import process_file
4
+ from fastapi import APIRouter, File, Form, HTTPException, UploadFile
5
+
6
router = APIRouter()


@router.post("/upload")
async def upload_files(files: list[UploadFile] = File(...), session_id: str = Form(...)) -> dict:
    """Ingest uploaded files into the vector-store namespace *session_id*.

    Each file is validated (name, extension, size), written to a private
    temporary directory under UPLOAD_DIR, processed, and removed again.
    Failures are collected per file rather than aborting the request.

    Returns:
        A dict with ``total_files``, ``total_chunks``, per-file ``details``
        and per-file ``errors``.

    Raises:
        HTTPException(400): if more than MAX_FILE_COUNT files are sent.
        HTTPException(422): if every file failed.
    """
    # Function-scope imports keep this block self-contained.
    import asyncio
    import tempfile

    results = []
    errors = []
    total_chunks = 0

    if len(files) > MAX_FILE_COUNT:
        raise HTTPException(400, f"Maximum {MAX_FILE_COUNT} files allowed")

    os.makedirs(UPLOAD_DIR, exist_ok=True)

    for file in files:
        original_name = file.filename or "upload.bin"
        safe_name = os.path.basename(original_name)
        if not safe_name:
            errors.append({"source": original_name, "error": "Invalid filename"})
            continue

        # splitext yields "" for names without a real extension, so a file
        # literally named "pdf" is no longer accepted as a PDF.
        ext = os.path.splitext(safe_name)[1][1:].lower()
        if ext not in ALLOWED_TYPES:
            errors.append({"source": original_name, "error": f"Unsupported file type: .{ext}"})
            continue

        # Reject early when the upload reports its size, before reading it.
        declared_size = getattr(file, "size", None)
        if declared_size is not None and declared_size > MAX_FILE_SIZE:
            errors.append({"source": original_name, "error": "File too large"})
            continue

        content = await file.read()
        if len(content) > MAX_FILE_SIZE:
            errors.append({"source": original_name, "error": "File too large"})
            continue

        try:
            # A per-file temporary directory keeps the original file name while
            # preventing concurrent uploads of the same name from overwriting
            # or deleting each other's file; it is removed on exit.
            with tempfile.TemporaryDirectory(dir=UPLOAD_DIR) as workdir:
                path = os.path.join(workdir, safe_name)
                with open(path, "wb") as f:
                    f.write(content)
                # process_file is synchronous; run it off the event loop.
                chunks = await asyncio.to_thread(process_file, path, ext, session_id=session_id)
        except Exception as e:
            errors.append({"source": original_name, "error": str(e)})
        else:
            total_chunks += chunks
            results.append({"source": original_name, "chunks": chunks})

    if not results and errors:
        raise HTTPException(422, {"message": "All files failed to process", "errors": errors})

    return {
        "total_files": len(files),
        "total_chunks": total_chunks,
        "details": results,
        "errors": errors,
    }
app/static/favicon.ico ADDED
package.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "arc-backend",
3
+ "version": "2.0.0",
4
+ "private": true,
5
+ "scripts": {
6
+ "dev": ".venv\\Scripts\\activate && uvicorn app.main:app --reload --host 0.0.0.0 --port 8000",
7
+ "start": "uvicorn app.main:app --host 0.0.0.0 --port 8000"
8
+ }
9
+ }
requirements.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ python-multipart
4
+
5
+ langchain-community
6
+ langchain-core
7
+ langchain-text-splitters
8
+ langchain-google-genai
9
+
10
+ pinecone-client
11
+ langchain-pinecone
12
+
13
+ google-generativeai
14
+
15
+ pdfplumber
16
+ docx2txt
17
+ openpyxl
18
+ python-pptx
19
+ unstructured
20
+ markdown
21
+ jq
22
+
23
+ python-dotenv
vercel.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "builds": [
3
+ {
4
+ "src": "app/main.py",
5
+ "use": "@vercel/python"
6
+ }
7
+ ],
8
+ "routes": [
9
+ {
10
+ "src": "/(.*)",
11
+ "dest": "app/main.py"
12
+ }
13
+ ]
14
+ }