PRSHNTKUMR committed on
Commit
4a04295
·
verified ·
1 Parent(s): 2405c8d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +271 -0
app.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import json
4
+ import base64
5
+ from fastapi import FastAPI, HTTPException, Header, Depends
6
+ from fastapi.middleware.cors import CORSMiddleware
7
+ from pydantic import BaseModel
8
+ from typing import List, Optional
9
+ import anthropic
10
+ from google.oauth2 import service_account
11
+ from googleapiclient.discovery import build
12
+ from googleapiclient.http import MediaIoBaseDownload
13
+
14
app = FastAPI(title="Dr. Gini DocRAG Service")

# CORS - origins allowed to call this service from a browser.
# Set DOCRAG_ALLOWED_ORIGINS to a comma-separated list (for example
# "https://app.example.com,https://admin.example.com") to replace the
# development defaults below without editing the code.
_DEFAULT_ALLOWED_ORIGINS = [
    "https://your-frontend.netlify.app",
    "https://your-space.hf.space",
    "http://localhost:3000",
    "http://localhost:5173",
    "*",  # Remove in production, use specific domains
]
_configured_origins = [
    origin.strip()
    for origin in os.environ.get("DOCRAG_ALLOWED_ORIGINS", "").split(",")
    if origin.strip()
]

# NOTE(review): a wildcard origin together with allow_credentials=True is
# broader than intended for credentialed requests - confirm before production.
app.add_middleware(
    CORSMiddleware,
    # Fall back to the defaults when the variable is unset or empty.
    allow_origins=_configured_origins or _DEFAULT_ALLOWED_ORIGINS,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
30
+
31
# Simple API key auth (optional but recommended).
# When DOCRAG_API_KEY is unset or empty, authentication is disabled.
API_KEY = os.environ.get("DOCRAG_API_KEY", "")


def verify_api_key(x_api_key: Optional[str] = Header(None, alias="X-API-Key")):
    """Verify the ``X-API-Key`` request header if an API key is configured.

    Args:
        x_api_key: Value of the ``X-API-Key`` header, or ``None`` when the
            header is absent.

    Returns:
        ``True`` when no key is configured or the supplied key matches.

    Raises:
        HTTPException: 401 when a key is configured and the header is missing
            or does not match.
    """
    import hmac

    if not API_KEY:
        return True

    # Compare as bytes in constant time: a plain ``!=`` can reveal how many
    # leading characters matched, and compare_digest rejects non-ASCII str.
    supplied = (x_api_key or "").encode("utf-8")
    if not hmac.compare_digest(supplied, API_KEY.encode("utf-8")):
        raise HTTPException(status_code=401, detail="Invalid API key")
    return True
39
+
40
# Module-wide Anthropic client; the key is read from ANTHROPIC_API_KEY.
_anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
claude_client = anthropic.Anthropic(api_key=_anthropic_api_key)
42
+
43
+ # Google Drive Service Account
44
def get_drive_service():
    """Build a read-only Google Drive v3 client from the service account.

    The service-account key is read as a JSON document from the
    ``GOOGLE_SERVICE_ACCOUNT`` environment variable.

    Returns:
        The Drive v3 service object returned by ``build``.

    Raises:
        HTTPException: 500 when the variable is missing, empty, not valid
            JSON, or not a non-empty JSON object.
    """
    raw_credentials = os.environ.get("GOOGLE_SERVICE_ACCOUNT", "")

    try:
        # An empty or whitespace-only value is treated like an unset variable.
        service_account_info = (
            json.loads(raw_credentials) if raw_credentials.strip() else {}
        )
    except json.JSONDecodeError as exc:
        # Do not echo the raw value: it holds a private key.
        raise HTTPException(
            status_code=500,
            detail="Google Service Account is not valid JSON",
        ) from exc

    if not service_account_info or not isinstance(service_account_info, dict):
        raise HTTPException(status_code=500, detail="Google Service Account not configured")

    credentials = service_account.Credentials.from_service_account_info(
        service_account_info,
        scopes=['https://www.googleapis.com/auth/drive.readonly'],
    )

    return build('drive', 'v3', credentials=credentials)
57
+
58
+
59
+ # ============ Request/Response Models ============
60
+
61
class Document(BaseModel):
    """A document the caller selected, identified by its Google Drive file."""

    # Drive file ID; passed to the Drive API to download the file.
    driveFileId: str
    # Display name; used in logs, prompt headers and the response.
    fileName: str
    # MIME type supplied by the caller; decides how the file is sent to Claude.
    mimeType: str
65
+
66
+
67
class DocRAGRequest(BaseModel):
    """Request body for ``POST /docrag``."""

    # Caller-supplied identifiers; not read by the visible endpoint code.
    userId: str
    sessionId: str
    # Question to ask about the selected documents.
    query: str
    # Documents to download from Drive and include in the prompt.
    selectedDocs: List[Document]
72
+
73
+
74
class DocRAGResponse(BaseModel):
    """Response body for ``POST /docrag``."""

    # True when an answer was produced; False when ``error`` is set.
    success: bool
    # The query echoed back from the request.
    query: str
    # Answer text from Claude (set on success).
    answer: Optional[str] = None
    # File names of the documents used for this query.
    documentsUsed: List[str] = []
    # Error description (set on failure).
    error: Optional[str] = None
80
+
81
+
82
+ # ============ Helper Functions ============
83
+
84
def download_from_drive(drive_service, file_id: str, file_name: str) -> bytes:
    """Download a file's raw bytes from Google Drive.

    Args:
        drive_service: Drive v3 service object (as returned by ``build``).
        file_id: ID of the Drive file to download.
        file_name: Display name, used only in the error message.

    Returns:
        The complete file content.

    Raises:
        HTTPException: 400 when the download fails for any reason; the
            original exception is chained as the cause.
    """
    try:
        # supportsAllDrives lets the same call reach files stored on shared
        # drives as well as in My Drive.
        request = drive_service.files().get_media(
            fileId=file_id,
            supportsAllDrives=True,
        )

        file_buffer = io.BytesIO()
        downloader = MediaIoBaseDownload(file_buffer, request)

        # next_chunk() returns (progress, done); the progress is not needed.
        done = False
        while not done:
            _, done = downloader.next_chunk()

        return file_buffer.getvalue()

    except Exception as e:
        raise HTTPException(
            status_code=400,
            detail=f"Failed to download {file_name}: {str(e)}"
        ) from e
104
+
105
+
106
def get_claude_media_type(mime_type: str) -> tuple[str, str]:
    """Map a MIME type to the Claude content-block kind used for it.

    The input is normalised before matching: parameters such as
    ``; charset=utf-8`` are dropped and case and surrounding whitespace are
    ignored, so ``"Text/Plain; charset=UTF-8"`` matches like ``"text/plain"``.

    Args:
        mime_type: MIME type reported for the file; may be empty.

    Returns:
        ``(content_type, media_type)`` where ``content_type`` is
        ``"document"``, ``"image"`` or ``"text"`` and ``media_type`` is the
        normalised MIME type. Unrecognised types fall back to
        ``("document", "application/pdf")``.
    """
    docx_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    image_types = ("image/jpeg", "image/png", "image/gif", "image/webp")

    normalized = (mime_type or "").split(";", 1)[0].strip().lower()

    if normalized == "application/pdf":
        return "document", "application/pdf"

    if normalized in image_types:
        return "image", normalized

    # Every text/* subtype is text; previously only four exact subtypes were
    # recognised and the rest were mislabelled as PDF.
    if normalized.startswith("text/") or normalized == docx_type:
        return "text", normalized

    return "document", "application/pdf"
120
+
121
+
122
+ # ============ API Endpoints ============
123
+
124
_DOCX_MIME_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"


def _extract_docx_text(data: bytes) -> Optional[str]:
    """Return the paragraph text of a .docx file, or None if it cannot be parsed."""
    import zipfile
    import xml.etree.ElementTree as ET

    # A .docx is a zip archive; the body lives in word/document.xml, where
    # <w:p> elements are paragraphs and <w:t> elements hold the text runs.
    ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    try:
        with zipfile.ZipFile(io.BytesIO(data)) as archive:
            root = ET.fromstring(archive.read("word/document.xml"))
    except (zipfile.BadZipFile, KeyError, ET.ParseError):
        return None

    paragraphs = (
        "".join(node.text or "" for node in paragraph.iter(f"{ns}t"))
        for paragraph in root.iter(f"{ns}p")
    )
    return "\n".join(paragraphs)


def _document_to_block(file_name: str, mime_type: str, data: bytes) -> Optional[dict]:
    """Convert one downloaded file into a Claude content block, or None if unreadable."""
    content_type, media_type = get_claude_media_type(mime_type)

    # PDFs and images share the same base64 "source" shape.
    if content_type in ("document", "image"):
        return {
            "type": content_type,
            "source": {
                "type": "base64",
                "media_type": media_type,
                "data": base64.b64encode(data).decode("utf-8"),
            },
        }

    if media_type.lower() == _DOCX_MIME_TYPE:
        # .docx is binary; decoding it as UTF-8 can never succeed.
        text_content = _extract_docx_text(data)
    else:
        try:
            text_content = data.decode("utf-8")
        except UnicodeDecodeError:
            text_content = None

    if text_content is None:
        return None

    return {
        "type": "text",
        "text": f"=== Document: {file_name} ===\n\n{text_content}\n\n=== End ===",
    }


def _answer_from_documents(request: DocRAGRequest) -> DocRAGResponse:
    """Download the selected documents, ask Claude, and build the response.

    This is the blocking part of the ``/docrag`` endpoint. All failures are
    reported through ``DocRAGResponse(success=False, error=...)`` rather than
    raised.
    """
    try:
        drive_service = get_drive_service()

        content = []
        documents_used = []
        downloaded_count = 0

        for doc in request.selectedDocs:
            # Best effort: a document that fails to download is skipped.
            try:
                file_bytes = download_from_drive(
                    drive_service,
                    doc.driveFileId,
                    doc.fileName,
                )
            except Exception as e:
                print(f"✗ Error downloading {doc.fileName}: {e}")
                continue

            downloaded_count += 1
            print(f"✓ Downloaded: {doc.fileName}")

            block = _document_to_block(doc.fileName, doc.mimeType, file_bytes)
            if block is None:
                # Report only documents that actually reach the prompt.
                print(f"✗ Could not read text from {doc.fileName}; skipping")
                continue

            content.append(block)
            documents_used.append(doc.fileName)

        if downloaded_count == 0:
            return DocRAGResponse(
                success=False,
                query=request.query,
                error="Could not download any documents. Check if folder is shared with service account."
            )

        if not content:
            # Without this guard Claude would be asked the bare query and
            # answer as if no documents had been supplied.
            return DocRAGResponse(
                success=False,
                query=request.query,
                error="None of the downloaded documents could be read."
            )

        # Add query
        content.append({
            "type": "text",
            "text": request.query
        })

        # Call Claude
        response = claude_client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=4096,
            system="""You are Dr. Gini, a research copilot for drug discovery and pharmaceutical research.

When answering:
- Be precise and cite specific sections when relevant
- If information is not in the documents, say so clearly
- For multiple documents, compare and synthesize across them
- Use scientific terminology appropriately
- Highlight key findings, methods, and limitations""",
            messages=[{"role": "user", "content": content}]
        )

        # Join every text block instead of assuming the first block is text.
        answer = "".join(
            block.text
            for block in response.content
            if getattr(block, "type", None) == "text"
        )
        if not answer:
            return DocRAGResponse(
                success=False,
                query=request.query,
                error="Claude returned no text content."
            )

        return DocRAGResponse(
            success=True,
            query=request.query,
            answer=answer,
            documentsUsed=documents_used
        )

    except anthropic.APIError as e:
        return DocRAGResponse(
            success=False,
            query=request.query,
            error=f"Claude API error: {str(e)}"
        )
    except Exception as e:
        import traceback
        traceback.print_exc()
        return DocRAGResponse(
            success=False,
            query=request.query,
            error=f"Error: {str(e)}"
        )


@app.post("/docrag", response_model=DocRAGResponse)
async def chat_with_documents(
    request: DocRAGRequest,
    authenticated: bool = Depends(verify_api_key)
):
    """Chat with uploaded documents using Claude.

    Args:
        request: Query plus the Drive documents to answer it from.
        authenticated: Result of the API-key dependency (unused otherwise).

    Returns:
        A ``DocRAGResponse``; failures are reported in its ``error`` field
        with ``success=False``.
    """
    import asyncio

    # The Drive download and the Claude call are blocking; run them in a
    # worker thread so the event loop can keep serving other requests.
    return await asyncio.to_thread(_answer_from_documents, request)
242
+
243
+
244
@app.get("/health")
async def health_check():
    """Liveness probe: return a fixed payload naming the service."""
    payload = {"status": "healthy", "service": "Dr. Gini DocRAG"}
    return payload
247
+
248
+
249
@app.get("/test-drive")
async def test_drive_connection(authenticated: bool = Depends(verify_api_key)):
    """Test the Google Drive connection by listing up to five visible files.

    The endpoint returns Drive file names, so it uses the same API-key
    dependency as ``/docrag``.

    Args:
        authenticated: Result of the API-key dependency (unused otherwise).

    Returns:
        ``{"status": "connected", "files_visible": ..., "sample_files": ...}``
        on success, or ``{"status": "error", "error": ...}`` on any failure.
    """
    import asyncio

    def _list_sample_files():
        # Blocking Drive call; executed in a worker thread below.
        drive_service = get_drive_service()
        results = drive_service.files().list(
            pageSize=5,
            fields="files(id, name)"
        ).execute()
        return results.get('files', [])

    try:
        files = await asyncio.to_thread(_list_sample_files)
        return {
            "status": "connected",
            "files_visible": len(files),
            "sample_files": [f["name"] for f in files[:5]]
        }
    except Exception as e:
        return {"status": "error", "error": str(e)}
267
+
268
+
269
if __name__ == "__main__":
    # Direct execution: serve the app with uvicorn on all interfaces, port 7860.
    from uvicorn import run as run_server

    run_server(app, host="0.0.0.0", port=7860)