karani10 committed on
Commit
b01addc
·
1 Parent(s): 7d25988

add files

Browse files
Files changed (4) hide show
  1. DockerFile +0 -13
  2. Dockerfile +0 -2
  3. app.py +187 -4
  4. requirements.txt +26 -2
DockerFile DELETED
@@ -1,13 +0,0 @@
1
- FROM python:3.9
2
-
3
- RUN useradd -m -u 1000 user
4
- USER user
5
- ENV PATH="/home/user/.local/bin:$PATH"
6
-
7
- WORKDIR /app
8
-
9
- COPY --chown=user ./requirements.txt requirements.txt
10
- RUN pip install --no-cache-dir --upgrade -r requirements.txt
11
-
12
- COPY --chown=user . /app
13
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Dockerfile CHANGED
@@ -1,5 +1,3 @@
1
- # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
- # you will also find guides on how best to write your Dockerfile
3
 
4
  FROM python:3.9
5
 
 
 
 
1
 
2
  FROM python:3.9
3
 
app.py CHANGED
@@ -1,7 +1,190 @@
1
- from fastapi import FastAPI
 
 
 
 
2
 
3
- app = FastAPI()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  @app.get("/")
6
- def greet_json():
7
- return {"Hello": "World!"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import tempfile
4
+ from contextlib import asynccontextmanager
5
+ from typing import Annotated
6
 
7
+ from dotenv import dotenv_values
8
+ from fastapi import FastAPI, File, Form, HTTPException, UploadFile
9
+ from fastapi.middleware.cors import CORSMiddleware
10
+ from pydantic import BaseModel
11
+
12
+
13
+ from helpers import (
14
+ generate_embedding_doc,
15
+ get_text_from_pdf,
16
+ run_rag_pipeline,
17
+ split_doc_chunks,
18
+ )
19
+
20
+ # --------------------------------------------------
21
+ # CONFIG
22
+ # --------------------------------------------------
23
+
24
# Values parsed from the local .env file (an empty mapping if the file is absent).
config = dotenv_values(".env")

# Prefer the value from .env, but fall back to the process environment when
# the key is missing from .env OR present without a usable value
# (dotenv_values yields None for a bare `KEY` line and "" for `KEY=`).
# A plain `config.get(key, default)` would only cover the "missing" case and
# would let an empty .env entry mask a valid environment variable.
GROQ_API_KEY = config.get("GROQ_API_KEY") or os.getenv("GROQ_API_KEY", "")

# Fail fast at import time: the API cannot answer queries without the key.
if not GROQ_API_KEY:
    raise RuntimeError("Missing GROQ_API_KEY")

# --------------------------------------------------
# SIMPLE LIST STORAGE: Chroma DB is not used because it caused deployment issues
# --------------------------------------------------

# In-memory index for the most recently uploaded PDF: replaced by /upload-pdf
# and read by /query. It is process-local and lost on restart.
chunked_documents = []
39
+
40
+ # --------------------------------------------------
41
+ # FASTAPI
42
+ # --------------------------------------------------
43
+
44
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: log once on startup and once on shutdown.

    Code before the ``yield`` runs before the app starts serving; code after
    it runs when the app shuts down. ``app`` is accepted because the lifespan
    protocol passes it, but it is not used here.
    """
    print("API Started")  # startup phase
    yield
    print("API Stopped")  # shutdown phase
49
+
50
# The ASGI application; `lifespan` provides the startup/shutdown logging.
app = FastAPI(title="Simple RAG API", lifespan=lifespan)

# CORS is left fully open: every origin, HTTP method, and request header
# is accepted.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
61
+
62
+ # --------------------------------------------------
63
+ # SCHEMAS
64
+ # --------------------------------------------------
65
+
66
# Request body for POST /query. Each field is forwarded unchanged to
# run_rag_pipeline under the same name.
class QueryRequest(BaseModel):
    # The user's question (required).
    question: str
    # Passed to the pipeline as `top_k`; defaults to 5.
    top_k: int = 5
    # Passed to the pipeline as `rerank_top_k`; defaults to 3.
    rerank_top_k: int = 3
70
+
71
+ # --------------------------------------------------
72
+ # ROUTES
73
+ # --------------------------------------------------
74
 
75
# Root endpoint: returns a fixed status payload showing the API is up.
@app.get("/")
def home():
    status_payload = {"message": "RAG API Running"}
    return status_payload
80
+
81
+ # --------------------------------------------------
82
+ # UPLOAD PDF
83
+ # --------------------------------------------------
84
+
85
@app.post("/upload-pdf")
async def upload_pdf(
    file: Annotated[
        UploadFile,
        File(description="PDF file")
    ],
):
    """Index an uploaded PDF, replacing the current in-memory index.

    The upload is written to a temporary file, its text is extracted,
    split into chunks and embedded. The global ``chunked_documents`` is
    replaced only after every step has succeeded, so a failed upload
    leaves the previous index intact.

    Returns:
        A dict with a success ``message`` and the number of ``chunks``.

    Raises:
        HTTPException: 400 if the file name does not end in ``.pdf``
            (case-insensitive) or no text could be extracted.
    """
    print("FILE SEND: ", file)

    global chunked_documents

    # `filename` may be None for some multipart clients; treat that as
    # "not a PDF" instead of crashing. Compare case-insensitively so that
    # "REPORT.PDF" is accepted too.
    if not (file.filename or "").lower().endswith(".pdf"):
        raise HTTPException(
            status_code=400,
            detail="Only PDF allowed"
        )

    # delete=False because the helper reopens the file by path; removal is
    # handled by the `finally` below, which also covers a failed copy.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    tmp_path = tmp.name

    try:
        # Close the handle (flushing the data) before anything reads the path.
        with tmp:
            shutil.copyfileobj(file.file, tmp)

        # NOTE(review): the helpers below are called synchronously inside an
        # `async def` handler; if they are CPU/IO heavy they will block the
        # event loop for the duration of the upload — confirm and consider
        # offloading to a thread pool.
        documents = get_text_from_pdf(tmp_path)

        if not documents:
            raise HTTPException(
                status_code=400,
                detail="No text found"
            )

        # Work on locals so that a failure while chunking or embedding does
        # not clobber the index that /query is currently serving.
        new_chunks = split_doc_chunks(documents)
        indexed_chunks = generate_embedding_doc(new_chunks)

        # Publish the new index only after every step succeeded.
        chunked_documents = indexed_chunks

        return {
            "message": "PDF indexed successfully",
            "chunks": len(indexed_chunks)
        }

    finally:
        os.unlink(tmp_path)
159
+
160
+ # --------------------------------------------------
161
+ # QUERY
162
+ # --------------------------------------------------
163
+
164
# Answer a question against the currently indexed PDF.
# Responds with HTTP 400 ("Upload PDF first") while the in-memory index is
# empty; otherwise returns the question together with the pipeline's answer.
@app.post("/query")
def query(req: QueryRequest):
    # The module-level index is only read here, so no `global` is required.
    print("Question", req)

    if not chunked_documents:
        raise HTTPException(status_code=400, detail="Upload PDF first")

    pipeline_args = {
        "question": req.question,
        "chunked_documents": chunked_documents,
        "groq_api_key": GROQ_API_KEY,
        "top_k": req.top_k,
        "rerank_top_k": req.rerank_top_k,
    }
    answer = run_rag_pipeline(**pipeline_args)

    return {"question": req.question, "answer": answer}
189
+
190
+
requirements.txt CHANGED
@@ -1,2 +1,26 @@
1
- fastapi
2
- uvicorn[standard]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RAG API — Python dependencies
2
+
3
+ # Web framework
4
+ fastapi>=0.111.0
5
+ uvicorn[standard]>=0.29.0
6
+ python-multipart>=0.0.9
7
+
8
+ # Environment
9
+ python-dotenv>=1.0.0
10
+
11
+ # LLM
12
+ groq>=0.9.0
13
+
14
+ # Embeddings
15
+ sentence-transformers>=3.0.0
16
+
17
+ # PDF extraction
18
+ pdfplumber>=0.11.0
19
+
20
+ # Retrieval / ranking
21
+ rank-bm25>=0.2.2
22
+ scikit-learn>=1.4.0
23
+ numpy>=1.26.0
24
+
25
+ # Validation
26
+ pydantic>=2.7.0