Files changed (4) hide show
  1. ap.py +584 -0
  2. ap1.py +0 -396
  3. app.py +163 -97
  4. requirements.txt +1 -2
ap.py ADDED
@@ -0,0 +1,584 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import requests
4
+ import tempfile
5
+ from google.oauth2 import service_account
6
+ from googleapiclient.discovery import build
7
+ from googleapiclient.http import MediaIoBaseDownload
8
+ import openai
9
+ from dotenv import load_dotenv, dotenv_values
10
+ import io
11
+ import logging
12
+ from typing import List, Dict, Optional
13
+
14
+ # LangChain imports
15
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
16
+ from langchain_community.vectorstores import FAISS
17
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
18
+ from langchain_community.vectorstores import FAISS
19
+ from langchain.docstore.document import Document
20
+ from langchain.chains import RetrievalQA
21
+ from langchain.prompts import PromptTemplate
22
+ from langchain.memory import ConversationBufferMemory
23
+ from langchain.chains import ConversationalRetrievalChain
24
+ from langchain.schema import BaseRetriever
25
+ import pickle
26
+ import hashlib
27
+
from openai import OpenAI

# Load variables from a local .env file (if present) so the os.getenv()
# calls below and inside the class can see them. Fix: load_dotenv was
# imported at the top of the file but never invoked.
load_dotenv()

openai.api_key = os.getenv('OPENAI_API_KEY')
# NOTE(review): this rebinds the name `openai` from the module to a client
# instance, shadowing the import. Kept as-is for backward compatibility —
# the rest of this file treats `openai` as a client object.
openai = OpenAI(api_key=openai.api_key)

# Module-level logging configuration; one logger per module by convention.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
35
+
class EnhancedGPTDriveIntegration:
    """Google Drive + OpenAI/LangChain study assistant.

    Searches a service account's Drive for study materials, extracts their
    text (Docs, Sheets, PDFs, plain text), indexes the chunks in a FAISS
    vector store, and answers questions through a conversational retrieval
    chain with buffer memory. Processed files are cached on disk so repeated
    queries do not re-download or re-chunk unchanged files.
    """

    def __init__(self):
        # Build service-account credentials from individual env vars.
        # GOOGLE_PRIVATE_KEY is typically stored with literal "\n" sequences
        # in .env, so restore real newlines. Fix: `or ''` prevents an
        # AttributeError on a missing variable so the friendly
        # missing-fields ValueError below can fire instead.
        credentials_info = {
            "type": "service_account",
            "project_id": os.getenv('GOOGLE_PROJECT_ID'),
            "private_key_id": os.getenv('GOOGLE_PRIVATE_KEY_ID'),
            "private_key": (os.getenv('GOOGLE_PRIVATE_KEY') or '').replace('\\n', '\n'),
            "client_email": os.getenv('GOOGLE_CLIENT_EMAIL'),
            "client_id": os.getenv('GOOGLE_CLIENT_ID'),
            "auth_uri": "https://accounts.google.com/o/oauth2/auth",
            "token_uri": "https://oauth2.googleapis.com/token",
            "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
            "client_x509_cert_url": os.getenv('GOOGLE_CLIENT_CERT_URL'),
            "universe_domain": "googleapis.com"
        }

        # Fail fast with a clear message when mandatory credentials are absent.
        required_fields = ['project_id', 'private_key', 'client_email']
        missing_fields = [field for field in required_fields if not credentials_info[field]]
        if missing_fields:
            raise ValueError(f"Missing required environment variables: {missing_fields}")

        # Read-only Google Drive API client.
        self.credentials = service_account.Credentials.from_service_account_info(
            credentials_info,
            scopes=['https://www.googleapis.com/auth/drive.readonly']
        )
        self.drive_service = build('drive', 'v3', credentials=self.credentials)

        # LangChain components. These constructors read OPENAI_API_KEY from
        # the environment themselves; the old `openai.api_key = ...` line
        # here only set an attribute on the module-level client instance and
        # had no effect, so it was removed.
        self.embeddings = OpenAIEmbeddings()
        self.llm = ChatOpenAI(temperature=0.7, model="gpt-3.5-turbo")

        # Splitter for chunking extracted document text.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )

        # The vector store is built lazily on the first processed query.
        self.vector_store = None
        self.conversation_memory = ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=True
        )

        # On-disk cache of already-chunked files, keyed by file hash.
        self.processed_files = {}
        self.cache_file = "processed_files_cache.pkl"
        self.load_cache()

    def load_cache(self):
        """Load the processed-files cache from disk, if it exists."""
        try:
            if os.path.exists(self.cache_file):
                with open(self.cache_file, 'rb') as f:
                    self.processed_files = pickle.load(f)
                logger.info(f"Loaded cache with {len(self.processed_files)} files")
        except Exception as e:
            # A corrupt/unreadable cache is not fatal; start fresh.
            logger.error(f"Error loading cache: {e}")
            self.processed_files = {}

    def save_cache(self):
        """Persist the processed-files cache to disk (best effort)."""
        try:
            with open(self.cache_file, 'wb') as f:
                pickle.dump(self.processed_files, f)
            logger.info("Cache saved successfully")
        except Exception as e:
            logger.error(f"Error saving cache: {e}")

    def get_file_hash(self, file_id: str, file_size: str) -> str:
        """Return a cache key for a file; size changes invalidate the entry."""
        return hashlib.md5(f"{file_id}_{file_size}".encode()).hexdigest()

    def search_files(self, query: str, file_types: Optional[List[str]] = None) -> List[Dict]:
        """Search Drive file names and full text for every term in `query`.

        Terms are ANDed together, optionally restricted to the given file
        types. Returns a list of file metadata dicts (empty on error).
        """
        search_terms = query.lower().split()
        search_queries = []

        # Each term must appear in the name or the indexed full text.
        for term in search_terms:
            search_queries.append(f"name contains '{term}' or fullText contains '{term}'")

        search_query = " and ".join([f"({sq})" for sq in search_queries])

        if file_types:
            type_queries = []
            for file_type in file_types:
                if file_type.lower() == 'pdf':
                    type_queries.append("mimeType='application/pdf'")
                elif file_type.lower() in ['doc', 'docx']:
                    type_queries.append("mimeType contains 'document'")
                elif file_type.lower() in ['xls', 'xlsx']:
                    type_queries.append("mimeType contains 'spreadsheet'")
                elif file_type.lower() == 'txt':
                    type_queries.append("mimeType='text/plain'")

            if type_queries:
                search_query += f" and ({' or '.join(type_queries)})"

        try:
            results = self.drive_service.files().list(
                q=search_query,
                fields="files(id, name, mimeType, size, modifiedTime)",
                pageSize=20  # Increased to get more results
            ).execute()

            files = results.get('files', [])
            logger.info(f"Found {len(files)} files matching query: {query}")
            return files

        except Exception as e:
            logger.error(f"Error searching files: {e}")
            return []

    def _download(self, request) -> io.BytesIO:
        """Stream a Drive download/export request into an in-memory buffer."""
        buffer = io.BytesIO()
        downloader = MediaIoBaseDownload(buffer, request)
        done = False
        while not done:
            _status, done = downloader.next_chunk()
        return buffer

    def get_file_content(self, file_id: str, mime_type: str) -> str:
        """Download a Drive file and return its text content.

        Google Docs are exported as plain text, Sheets as CSV, PDFs parsed
        with PyPDF2 (falling back to pdfplumber). Returns an explanatory
        string starting with 'Error' instead of raising.
        """
        try:
            if 'text' in mime_type or 'document' in mime_type:
                if 'document' in mime_type:
                    # Native Google Docs must be exported, not downloaded.
                    request = self.drive_service.files().export_media(
                        fileId=file_id, mimeType='text/plain'
                    )
                else:
                    request = self.drive_service.files().get_media(fileId=file_id)
                return self._download(request).getvalue().decode('utf-8', errors='ignore')

            if 'spreadsheet' in mime_type:
                request = self.drive_service.files().export_media(
                    fileId=file_id, mimeType='text/csv'
                )
                return self._download(request).getvalue().decode('utf-8', errors='ignore')

            if mime_type == 'application/pdf':
                request = self.drive_service.files().get_media(fileId=file_id)
                buffer = self._download(request)
                buffer.seek(0)
                return self._extract_pdf_text(buffer)

            return "File type not supported for text extraction"

        except Exception as e:
            logger.error(f"Error reading file {file_id}: {e}")
            return f"Error reading file: {str(e)}"

    def _extract_pdf_text(self, buffer: io.BytesIO) -> str:
        """Extract text from an in-memory PDF, preferring PyPDF2 over pdfplumber."""
        try:
            import PyPDF2
            pdf_reader = PyPDF2.PdfReader(buffer)
            text = ""
            for page in pdf_reader.pages:
                text += page.extract_text() + "\n"
            return text
        except ImportError:
            logger.warning("PyPDF2 not available, trying alternative PDF extraction")
            try:
                import pdfplumber
                with pdfplumber.open(buffer) as pdf:
                    text = ""
                    for page in pdf.pages:
                        text += page.extract_text() + "\n"
                    return text
            except ImportError:
                return "PDF text extraction requires PyPDF2 or pdfplumber library"
            except Exception as e:
                return f"Error extracting PDF text: {str(e)}"

    def process_documents_to_vector_store(self, files: List[Dict]) -> None:
        """Chunk the given files and create or extend the FAISS vector store.

        Cached files are reused; only newly processed files are embedded
        into an existing store. Fix: the previous incremental-add logic
        compared cached documents against the current batch with
        `doc not in documents`, which never selected the right set — we now
        explicitly track the chunks produced in this call.
        """
        documents = []       # all chunks relevant to this query
        new_documents = []   # chunks from files not seen before
        new_files_processed = 0

        for file in files:
            file_hash = self.get_file_hash(file['id'], file.get('size', '0'))

            # Unchanged file: reuse the cached chunks.
            if file_hash in self.processed_files:
                documents.extend(self.processed_files[file_hash])
                continue

            content = self.get_file_content(file['id'], file['mimeType'])
            if not content or content.startswith('Error'):
                continue

            chunks = self.text_splitter.split_text(content)
            file_documents = [
                Document(
                    page_content=chunk,
                    metadata={
                        'source': file['name'],
                        'file_id': file['id'],
                        'chunk_id': i,
                        'mime_type': file['mimeType'],
                        'total_chunks': len(chunks)
                    }
                )
                for i, chunk in enumerate(chunks)
            ]

            documents.extend(file_documents)
            new_documents.extend(file_documents)
            self.processed_files[file_hash] = file_documents
            new_files_processed += 1
            logger.info(f"Processed file: {file['name']} ({len(chunks)} chunks)")

        if new_files_processed > 0:
            self.save_cache()
            logger.info(f"Processed {new_files_processed} new files")

        if not documents:
            return
        if self.vector_store is None:
            self.vector_store = FAISS.from_documents(documents, self.embeddings)
            logger.info(f"Created new vector store with {len(documents)} documents")
        elif new_documents:
            self.vector_store.add_documents(new_documents)
            logger.info(f"Added {len(new_documents)} new documents to vector store")

    def create_conversational_chain(self) -> ConversationalRetrievalChain:
        """Build a conversational retrieval chain over the vector store.

        Raises ValueError if no documents have been processed yet.
        """
        if self.vector_store is None:
            raise ValueError("Vector store not initialized. Process documents first.")

        # Custom prompt injecting the Study Buddy persona and instructions.
        prompt_template = """You are Study Buddy, an AI assistant specialized in helping students study anatomy effectively.
Use the following context from the student's study materials to answer their question.

Context: {context}

Question: {question}

Instructions:
1. Answer the question directly and comprehensively using the provided context
2. If the context doesn't contain enough information, say so clearly
3. Provide study tips or exam strategies when relevant
4. Use clear, educational language appropriate for students
5. Always end your response with "Is there anything else I can help you with?"

Answer:"""

        PROMPT = PromptTemplate(
            template=prompt_template,
            input_variables=["context", "question"]
        )

        qa_chain = ConversationalRetrievalChain.from_llm(
            llm=self.llm,
            retriever=self.vector_store.as_retriever(
                search_type="similarity",
                search_kwargs={"k": 6}  # top 6 relevant chunks
            ),
            memory=self.conversation_memory,
            combine_docs_chain_kwargs={"prompt": PROMPT},
            return_source_documents=True,
            verbose=True
        )

        return qa_chain

    def process_query(self, user_query: str, search_terms_input: Optional[List[str]] = None) -> Dict:
        """End-to-end query: search Drive, index the hits, ask the chain.

        Returns a dict with 'answer', 'sources', 'confidence' and (on
        success) search statistics; never raises.
        """
        try:
            search_terms = search_terms_input
            if not search_terms:
                search_terms = user_query.lower().split()[:5]  # first 5 words

            # Collect candidate files across all terms.
            all_files = []
            for term in search_terms:
                all_files.extend(self.search_files(term))

            # De-duplicate by file id, preserving order.
            unique_files = []
            seen_ids = set()
            for file in all_files:
                if file['id'] not in seen_ids:
                    unique_files.append(file)
                    seen_ids.add(file['id'])

            if not unique_files:
                return {
                    'answer': "No relevant files found in your Google Drive for this query. Please check if you have uploaded study materials related to your question.",
                    'sources': [],
                    'confidence': 'low'
                }

            # Index at most the top 10 candidate files.
            self.process_documents_to_vector_store(unique_files[:10])

            if self.vector_store is None:
                return {
                    'answer': "Unable to process the documents. Please check if the files contain readable text content.",
                    'sources': [],
                    'confidence': 'low'
                }

            qa_chain = self.create_conversational_chain()

            # NOTE(review): calling the chain directly is the legacy
            # LangChain API; newer releases prefer .invoke(). Kept for
            # compatibility with the pinned version.
            result = qa_chain({"question": user_query})

            source_docs = result.get('source_documents', [])
            sources = list(set([doc.metadata['source'] for doc in source_docs]))

            # Crude confidence heuristic based on how many chunks were used.
            confidence = 'high' if len(source_docs) >= 3 else 'medium' if len(source_docs) >= 1 else 'low'

            return {
                'answer': result['answer'],
                'sources': sources,
                'confidence': confidence,
                'total_files_searched': len(unique_files),
                'chunks_retrieved': len(source_docs)
            }

        except Exception as e:
            logger.error(f"Error processing query: {e}")
            return {
                'answer': f"An error occurred while processing your query: {str(e)}. Please try again or rephrase your question.",
                'sources': [],
                'confidence': 'low'
            }

    def clear_memory(self):
        """Clear the conversational buffer memory."""
        self.conversation_memory.clear()
        logger.info("Conversation memory cleared")

    def get_vector_store_stats(self) -> Dict:
        """Return chunk/file/cache counts for the current vector store.

        Note: when the store is empty the returned dict has no 'cache_size'
        key — callers must read it defensively.
        """
        if self.vector_store is None:
            return {"total_documents": 0, "total_files": 0}

        try:
            # _dict is FAISS's internal docstore mapping; cheap but private.
            total_docs = len(self.vector_store.docstore._dict)
            total_files = len(set([doc.metadata.get('source', 'Unknown')
                                   for doc in self.vector_store.docstore._dict.values()]))

            return {
                "total_documents": total_docs,
                "total_files": total_files,
                "cache_size": len(self.processed_files)
            }
        except Exception:  # fix: was a bare except
            return {"total_documents": "Unknown", "total_files": "Unknown"}
# Initialize the enhanced system
# NOTE(review): instantiated at import time — importing this module requires
# the Google service-account env vars to be set and reaches the Drive API
# immediately; import fails otherwise.
enhanced_gpt_drive = EnhancedGPTDriveIntegration()
def process_user_query(query: str, search_terms_input: str) -> tuple:
    """Answer a user question and format (answer, sources, stats) for the UI.

    Empty/whitespace questions short-circuit with a prompt to type one.
    """
    if not query.strip():
        return "Please enter a question.", "", ""

    # Optional comma-separated focus terms entered in the UI.
    search_terms = None
    if search_terms_input.strip():
        search_terms = [term.strip() for term in search_terms_input.split(',')]

    result = enhanced_gpt_drive.process_query(query, search_terms)

    answer = result['answer']
    source_names = result['sources']

    # Build the markdown-ish sources panel only when sources exist.
    sources_text = ""
    if source_names:
        bullets = "\n".join(f"β€’ {name}" for name in source_names)
        sources_text = "**Sources used:**\n" + bullets
        sources_text += "\n\n**Search Details:**\n"
        sources_text += f"β€’ Files searched: {result.get('total_files_searched', 0)}\n"
        sources_text += f"β€’ Relevant chunks found: {result.get('chunks_retrieved', 0)}\n"
        sources_text += f"β€’ Confidence: {result.get('confidence', 'unknown').title()}"

    # Knowledge-base summary line shown beneath the sources.
    stats = enhanced_gpt_drive.get_vector_store_stats()
    stats_text = f"**Knowledge Base:** {stats['total_documents']} chunks from {stats['total_files']} files"

    return answer, sources_text, stats_text
def clear_conversation():
    """Reset the chat memory so the next question starts a fresh session.

    Returns the confirmation string shown in the answer box.
    """
    enhanced_gpt_drive.clear_memory()
    return "Conversation history cleared. You can start a fresh conversation now."
def get_system_status():
    """Build a human-readable system-status summary for the UI side panel.

    Returns a newline-joined string of status lines.
    """
    stats = enhanced_gpt_drive.get_vector_store_stats()

    # Fix: get_vector_store_stats() omits 'cache_size' when the vector store
    # is empty (and returns "Unknown" values on failure), so every key is
    # read defensively to avoid a KeyError crashing the status panel.
    status_lines = [
        "βœ… Google Drive API: Connected",
        "βœ… OpenAI API: Connected",
        "βœ… LangChain: Initialized",
        f"πŸ“š Knowledge Base: {stats.get('total_documents', 0)} document chunks",
        f"πŸ“ Processed Files: {stats.get('total_files', 0)} files",
        f"πŸ’Ύ Cache Size: {stats.get('cache_size', 0)} entries"
    ]

    return "\n".join(status_lines)
# Create enhanced Gradio interface
import gradio as gr

# Two-column layout: question/answer on the left, system status on the right.
with gr.Blocks(title="Enhanced Study Buddy", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🧠 Enhanced Anatomy Study Buddy with LangChain")
    gr.Markdown("Study more effectively with advanced AI-powered document analysis and conversational memory!")

    with gr.Row():
        with gr.Column(scale=3):
            # Main query interface
            with gr.Group():
                gr.Markdown("### πŸ’¬ Ask a Question")
                query_input = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask me anything about your anatomy study materials...",
                    lines=3
                )

                search_terms_input = gr.Textbox(
                    label="πŸ” Search Terms (Optional)",
                    placeholder="Enter comma-separated terms to focus the search",
                    lines=1
                )

                with gr.Row():
                    submit_btn = gr.Button("πŸš€ Search & Ask", variant="primary", size="lg")
                    clear_btn = gr.Button("🧹 Clear Memory", variant="secondary")

            # Results section
            with gr.Group():
                gr.Markdown("### 🎯 Answer")
                answer_output = gr.Textbox(
                    label="AI Response",
                    lines=12,
                    interactive=False
                )

                sources_output = gr.Textbox(
                    label="πŸ“š Sources & Details",
                    lines=6,
                    interactive=False
                )

        with gr.Column(scale=1):
            # System info
            with gr.Group():
                gr.Markdown("### πŸ“Š System Status")
                status_btn = gr.Button("πŸ”„ Refresh Status", size="sm")
                status_output = gr.Textbox(
                    label="System Information",
                    lines=8,
                    interactive=False
                )

                stats_output = gr.Textbox(
                    label="Knowledge Base",
                    lines=2,
                    interactive=False
                )

    # Event handlers
    # Main flow: question + optional terms -> answer, sources, KB stats.
    submit_btn.click(
        fn=process_user_query,
        inputs=[query_input, search_terms_input],
        outputs=[answer_output, sources_output, stats_output]
    )

    # NOTE(review): the clear-confirmation message is written into the
    # answer box, overwriting the last answer.
    clear_btn.click(
        fn=clear_conversation,
        outputs=answer_output
    )

    status_btn.click(
        fn=get_system_status,
        outputs=status_output
    )

    # Enhanced examples
    # Each example row fills (question, search terms).
    with gr.Row():
        gr.Examples(
            examples=[
                ["What is morbid anatomy and how does it relate to pathology?", "morbid, anatomy, pathology"],
                ["Explain the neural transmission process between neurons", "neuron, transmission, synaptic"],
                ["Describe the complete anatomy of the external ear", "external ear, anatomy, auditory"],
                ["What are the different types of therapeutic massage?", "massage, therapy, treatment"],
                ["Define trauma and its classification in medical terms", "trauma, medical, classification"],
                ["Explain upper limb prosthetics and their applications", "prosthetics, upper limb, rehabilitation"],
                ["How does the nervous system control muscle movement?", "nervous system, muscle, motor control"],
                ["What are the key anatomical landmarks for injection sites?", "injection sites, anatomical landmarks"]
            ],
            inputs=[query_input, search_terms_input]
        )

    # Initial status load
    app.load(
        fn=get_system_status,
        outputs=status_output
    )
# Launch the enhanced app
if __name__ == "__main__":
    app.launch(
        share=True,  # publishes a temporary public Gradio share link
        debug=True,  # NOTE(review): debug + share while bound to all interfaces exposes the app publicly — disable for production
        server_name="0.0.0.0",  # listen on all network interfaces
        server_port=7860
    )
ap1.py DELETED
@@ -1,396 +0,0 @@
1
- import os
2
- import json
3
- import requests
4
- import json
5
- import tempfile
6
- from google.oauth2 import service_account
7
- from googleapiclient.discovery import build
8
- from googleapiclient.http import MediaIoBaseDownload
9
- import openai
10
- from dotenv import load_dotenv, dotenv_values
11
- import io
12
-
13
- from openai import OpenAI
14
- openai.api_key = os.getenv('OPENAI_API_KEY')
15
- openai = OpenAI(api_key = openai.api_key)
16
-
17
-
18
-
19
-
class GPTDriveIntegration:
    """Minimal Google Drive + GPT question answering.

    Finds Drive files whose names match simple search terms, extracts their
    text, and asks a chat model to answer using that text as context.
    """

    def __init__(self):
        # Build credentials info from individual environment variables.
        # .env files store the private key with literal "\n" sequences, so
        # restore real newlines. Fix: `or ''` prevents AttributeError on a
        # missing variable so the explicit check below reports it instead.
        credentials_info = {
            "type": "service_account",
            "project_id": os.getenv('GOOGLE_PROJECT_ID'),
            "private_key_id": os.getenv('GOOGLE_PRIVATE_KEY_ID'),
            "private_key": (os.getenv('GOOGLE_PRIVATE_KEY') or '').replace('\\n', '\n'),
            "client_email": os.getenv('GOOGLE_CLIENT_EMAIL'),
            "client_id": os.getenv('GOOGLE_CLIENT_ID'),
            "auth_uri": "https://accounts.google.com/o/oauth2/auth",
            "token_uri": "https://oauth2.googleapis.com/token",
            "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
            "client_x509_cert_url": os.getenv('GOOGLE_CLIENT_CERT_URL'),
            "universe_domain": "googleapis.com"
        }

        # Fail fast with a clear message when mandatory credentials are absent.
        required_fields = ['project_id', 'private_key', 'client_email']
        missing_fields = [field for field in required_fields if not credentials_info[field]]
        if missing_fields:
            raise ValueError(f"Missing required environment variables: {missing_fields}")

        # Read-only Google Drive API client.
        self.credentials = service_account.Credentials.from_service_account_info(
            credentials_info,
            scopes=['https://www.googleapis.com/auth/drive.readonly']
        )
        self.drive_service = build('drive', 'v3', credentials=self.credentials)
        # Initialize OpenAI (attribute on the module-level client object).
        openai.api_key = os.getenv('OPENAI_API_KEY')

    def search_files(self, query, file_types=None):
        """Search Drive for files whose name contains `query`.

        Optionally restricts results to the given file types. Returns a
        list of file metadata dicts.
        """
        search_query = f"name contains '{query}'"

        if file_types:
            type_queries = []
            for file_type in file_types:
                if file_type.lower() == 'pdf':
                    type_queries.append("mimeType='application/pdf'")
                elif file_type.lower() in ['doc', 'docx']:
                    type_queries.append("mimeType contains 'document'")
                elif file_type.lower() in ['xls', 'xlsx']:
                    type_queries.append("mimeType contains 'spreadsheet'")

            if type_queries:
                search_query += f" and ({' or '.join(type_queries)})"

        results = self.drive_service.files().list(
            q=search_query,
            fields="files(id, name, mimeType, size)"
        ).execute()

        return results.get('files', [])

    def _download(self, request):
        """Stream a Drive download/export request into an in-memory buffer."""
        buffer = io.BytesIO()
        downloader = MediaIoBaseDownload(buffer, request)
        done = False
        while not done:
            _status, done = downloader.next_chunk()
        return buffer

    def get_file_content(self, file_id, mime_type):
        """Download a Drive file and return its text content.

        Google Docs export as plain text, Sheets as CSV, PDFs parse via
        PyPDF2. Returns an explanatory string instead of raising.
        Fix: decodes with errors='ignore' so stray non-UTF-8 bytes no
        longer raise UnicodeDecodeError.
        """
        try:
            if 'text' in mime_type or 'document' in mime_type:
                # For Google Docs, export as plain text
                if 'document' in mime_type:
                    request = self.drive_service.files().export_media(
                        fileId=file_id, mimeType='text/plain'
                    )
                else:
                    request = self.drive_service.files().get_media(fileId=file_id)
                return self._download(request).getvalue().decode('utf-8', errors='ignore')

            if 'spreadsheet' in mime_type:
                # For Google Sheets, export as CSV
                request = self.drive_service.files().export_media(
                    fileId=file_id, mimeType='text/csv'
                )
                return self._download(request).getvalue().decode('utf-8', errors='ignore')

            if mime_type == 'application/pdf':
                # Download binary content and extract text page by page.
                request = self.drive_service.files().get_media(fileId=file_id)
                buffer = self._download(request)
                buffer.seek(0)

                try:
                    import PyPDF2
                    pdf_reader = PyPDF2.PdfReader(buffer)
                    text = ""
                    for page in pdf_reader.pages:
                        text += page.extract_text() + "\n"
                    return text
                except ImportError:
                    return "PDF text extraction requires PyPDF2 library"

            return "File type not supported for text extraction"

        except Exception as e:
            return f"Error reading file: {str(e)}"

    # NOTE: the original file defined query_gpt_with_context and
    # process_query twice; Python binds the later definitions, so the first
    # copies were dead code and have been removed.

    def query_gpt_with_context(self, user_query, file_contents):
        """Ask the chat model to answer `user_query` using `file_contents`.

        file_contents: list of {'name', 'text'} dicts; only the first 2000
        characters of each file are included in the prompt.
        """
        context = "\n\n".join([
            f"File: {content['name']}\nContent: {content['text'][:2000]}..."
            for content in file_contents
        ])

        messages = [
            {
                "role": "system",
                # Fix: corrected "hel you" -> "help you" in the closing line.
                "content": """
                You are an AI assistant that can analyze documents from Google Drive.
                Use the provided file contents to answer user questions.
                Answer directly and add additional suggestions on how to answer questions in the exam
                Always end with 'Is there anything I can help you with?'
                Your name is Study buddy, happy to help students study more effectively
                """
            },
            {
                "role": "user",
                "content": f"Context from Google Drive files:\n{context}\n\nUser Question: {user_query}"
            }
        ]

        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            max_tokens=1000
        )

        return response.choices[0].message.content

    def process_query(self, user_query, search_terms=None):
        """Main entry point: search Drive, read top files, ask GPT.

        Returns {'answer': str, 'sources': [file names]}.
        """
        # Extract search terms from the query if not provided.
        if not search_terms:
            search_terms = user_query.split()[:3]  # Simple extraction

        # Search for relevant files across all terms.
        files = []
        for term in search_terms:
            files.extend(self.search_files(term))

        # Remove duplicates (keyed by file id).
        unique_files = {f['id']: f for f in files}.values()

        # Get content from the top 3 most relevant files.
        file_contents = []
        for file in list(unique_files)[:3]:
            content = self.get_file_content(file['id'], file['mimeType'])
            file_contents.append({
                'name': file['name'],
                'text': content
            })

        # Query GPT with context.
        if file_contents:
            response = self.query_gpt_with_context(user_query, file_contents)
            return {
                'answer': response,
                'sources': [f['name'] for f in file_contents]
            }
        else:
            return {
                'answer': "No relevant files found in your Google Drive.",
                'sources': []
            }
# Module-level singleton; instantiated at import time, so importing this file
# requires the Google service-account env vars to be set (raises otherwise).
gpt_drive = GPTDriveIntegration()
def process_user_query(query, search_terms_input):
    """Run a user question through gpt_drive and format the UI outputs.

    Returns (answer, sources_markdown); empty questions short-circuit.
    """
    if not query.strip():
        return "Please enter a question.", ""

    # Search-term parsing is intentionally disabled here; the integration
    # derives its own terms from the question text.
    search_terms = None

    result = gpt_drive.process_query(query, search_terms)

    answer = result['answer']
    source_names = result['sources']

    sources_text = ""
    if source_names:
        bullet_lines = [f"β€’ {name}" for name in source_names]
        sources_text = "**Sources used:**\n" + "\n".join(bullet_lines)

    return answer, sources_text
def check_setup():
    """Report whether the Google Drive and OpenAI APIs appear configured.

    Returns a newline-joined status string, one line per API.
    """
    status_messages = []

    # Fix: GPTDriveIntegration never sets drive_initialized /
    # openai_initialized (its __init__ raises on failure instead), so
    # reading them as plain attributes always raised AttributeError. Read
    # them defensively; missing flags report as not configured.
    if getattr(gpt_drive, 'drive_initialized', False):
        status_messages.append("βœ… Google Drive API: Connected")
    else:
        status_messages.append(f"❌ Google Drive API: {getattr(gpt_drive, 'drive_error', 'Not configured')}")

    if getattr(gpt_drive, 'openai_initialized', False):
        status_messages.append("βœ… OpenAI API: Connected")
    else:
        status_messages.append(f"❌ OpenAI API: {getattr(gpt_drive, 'openai_error', 'Not configured')}")

    return "\n".join(status_messages)
315
- # Create Gradio interface
316
- import gradio as gr
317
- with gr.Blocks(title="Study Buddy", theme=gr.themes.Soft()) as app:
318
- gr.Markdown("# Anatomy Study Buddy ")
319
- gr.Markdown("Study more effectively with study Buddy!")
320
-
321
- with gr.Row():
322
- with gr.Column(scale=2):
323
- # Main query interface
324
- with gr.Group():
325
- gr.Markdown("### Ask a Question")
326
- query_input = gr.Textbox(
327
- label="Your Question",
328
- placeholder="Ask me any question about your anatomy books?",
329
- lines=3
330
- )
331
-
332
- search_terms_input = gr.Textbox(
333
- label="Search Terms",
334
- placeholder="Enter comma-separated terms to search for specific files",
335
- lines=1
336
- )
337
-
338
- submit_btn = gr.Button("Search & Ask", variant="primary", size="lg")
339
-
340
- # Results section
341
- with gr.Group():
342
- gr.Markdown("### Answer")
343
- answer_output = gr.Textbox(
344
- label="AI Response",
345
- lines=10,
346
- interactive=False
347
- )
348
-
349
- sources_output = gr.Textbox(
350
- label="Sources",
351
- lines=3,
352
- interactive=False
353
- )
354
-
355
- # with gr.Column(scale=1):
356
- # # Status and setup info
357
- # with gr.Group():
358
- # gr.Markdown("### System Status")
359
- # status_btn = gr.Button("Check Status", size="sm")
360
- # status_output = gr.Textbox(
361
- # label="API Status",
362
- # lines=4,
363
- # interactive=False
364
- # )
365
-
366
-
367
- # Event handlers
368
- submit_btn.click(
369
- fn=process_user_query,
370
- inputs=[query_input, search_terms_input],
371
- outputs=[answer_output, sources_output]
372
- )
373
-
374
- # status_btn.click(
375
- # fn=check_setup,
376
- # outputs=status_output
377
- # )
378
-
379
- # Example queries
380
- with gr.Row():
381
- gr.Examples(
382
- examples=[
383
- ["What is morbid Anatomy?", "morbid, Anatomy"],
384
- ["The transmission of nerves from one neuron to another is as a result of what?", "neuron, nerves, Dr Clement"],
385
- ["Explain what the external ear contains of?", "Ear Anatomy, Ear"],
386
- ["What are the types of massage?", "massage Lecture, nerves"],
387
- ["What is trauma?", "Trauma, pysical trauma and sex Offenders"],
388
- ["what is Upper limb prosthetics?", "Upper limb prosthetics"],
389
- ],
390
- inputs=[query_input, search_terms_input],)
391
-
392
-
393
- # Launch the app
394
- if __name__ == "__main__":
395
- app.launch(
396
- share=True,debug =True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  import json
3
  import requests
 
4
  import tempfile
5
  from google.oauth2 import service_account
6
  from googleapiclient.discovery import build
@@ -8,12 +9,14 @@ from googleapiclient.http import MediaIoBaseDownload
8
  import openai
9
  from dotenv import load_dotenv, dotenv_values
10
  import io
11
- from markitdown import MarkItDown
12
 
13
  from openai import OpenAI
14
  openai.api_key = os.getenv('OPENAI_API_KEY')
15
  openai = OpenAI(api_key = openai.api_key)
16
 
 
 
 
17
  class GPTDriveIntegration:
18
  def __init__(self):
19
  # Build credentials info from individual environment variables
@@ -45,10 +48,6 @@ class GPTDriveIntegration:
45
  )
46
 
47
  self.drive_service = build('drive', 'v3', credentials=self.credentials)
48
-
49
- # Initialize MarkItDown
50
- self.md = MarkItDown()
51
-
52
  # Initialize OpenAI
53
  openai.api_key = os.getenv('OPENAI_API_KEY')
54
 
@@ -59,17 +58,12 @@ class GPTDriveIntegration:
59
  if file_types:
60
  type_queries = []
61
  for file_type in file_types:
62
- ext = file_type.lower().lstrip('.')
63
- if ext == 'pdf':
64
  type_queries.append("mimeType='application/pdf'")
65
- elif ext in ['doc', 'docx']:
66
  type_queries.append("mimeType contains 'document'")
67
- elif ext in ['xls', 'xlsx']:
68
  type_queries.append("mimeType contains 'spreadsheet'")
69
- elif ext in ['ppt', 'pptx']:
70
- type_queries.append("mimeType contains 'presentation'")
71
- elif ext in ['txt', 'md', 'markdown']:
72
- type_queries.append("mimeType='text/plain'")
73
 
74
  if type_queries:
75
  search_query += f" and ({' or '.join(type_queries)})"
@@ -81,82 +75,135 @@ class GPTDriveIntegration:
81
 
82
  return results.get('files', [])
83
 
84
- def get_file_content(self, file_id, file_name, mime_type):
85
- """Download and extract content from file using MarkItDown"""
86
  try:
87
- # Handle Google Workspace files - export to appropriate format for MarkItDown
88
- if 'document' in mime_type:
89
- # Export Google Docs as DOCX for better formatting preservation
90
- request = self.drive_service.files().export_media(
91
- fileId=file_id,
92
- mimeType='application/vnd.openxmlformats-officedocument.wordprocessingml.document'
93
- )
94
- file_extension = 'docx'
 
 
 
 
 
 
 
 
 
95
  elif 'spreadsheet' in mime_type:
96
- # Export Google Sheets as XLSX
97
  request = self.drive_service.files().export_media(
98
- fileId=file_id,
99
- mimeType='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
100
  )
101
- file_extension = 'xlsx'
102
- elif 'presentation' in mime_type:
103
- # Export Google Slides as PPTX
104
- request = self.drive_service.files().export_media(
105
- fileId=file_id,
106
- mimeType='application/vnd.openxmlformats-officedocument.presentationml.presentation'
107
- )
108
- file_extension = 'pptx'
109
- else:
110
- # For regular files, download as-is
111
- request = self.drive_service.files().get_media(fileId=file_id)
112
- file_extension = self._get_extension_from_name_or_mime(file_name, mime_type)
113
-
114
- # Download file content
115
- file_content = io.BytesIO()
116
- downloader = MediaIoBaseDownload(file_content, request)
117
- done = False
118
- while done is False:
119
- status, done = downloader.next_chunk()
120
-
121
- # Reset stream position
122
- file_content.seek(0)
123
-
124
- # Use MarkItDown to convert to markdown
125
- result = self.md.convert_stream(file_content, file_extension=file_extension)
126
 
127
- return result.text_content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
 
 
 
129
  except Exception as e:
130
- return f"Error processing file with MarkItDown: {str(e)}"
131
 
132
- def _get_extension_from_name_or_mime(self, file_name, mime_type):
133
- """Helper to determine file extension for MarkItDown"""
134
- # First try to get extension from filename
135
- if '.' in file_name:
136
- return file_name.split('.')[-1].lower()
 
137
 
138
- # Fallback to mime type mapping
139
- mime_to_ext = {
140
- 'application/pdf': 'pdf',
141
- 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
142
- 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
143
- 'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
144
- 'application/msword': 'doc',
145
- 'application/vnd.ms-excel': 'xls',
146
- 'application/vnd.ms-powerpoint': 'ppt',
147
- 'text/plain': 'txt',
148
- 'text/markdown': 'md',
149
- 'text/html': 'html',
150
- 'application/json': 'json',
151
- 'text/csv': 'csv'
152
- }
153
 
154
- return mime_to_ext.get(mime_type, 'txt')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
  def query_gpt_with_context(self, user_query, file_contents):
157
  """Send query to GPT with file context"""
158
  context = "\n\n".join([
159
- f"File: {content['name']}\nContent: {content['text'][:3000]}..."
160
  for content in file_contents
161
  ])
162
 
@@ -167,8 +214,9 @@ class GPTDriveIntegration:
167
  You are an AI assistant that can analyze documents from Google Drive.
168
  Use the provided file contents to answer user questions.
169
  Answer directly and add additional suggestions on how to answer questions in the exam
170
- Always end with 'Is there anything I can help you with?'
171
  Your name is Study buddy, happy to help students study more effectively
 
172
  """
173
  },
174
  {
@@ -202,7 +250,7 @@ class GPTDriveIntegration:
202
  # Get content from top 3 most relevant files
203
  file_contents = []
204
  for file in list(unique_files)[:3]:
205
- content = self.get_file_content(file['id'], file['name'], file['mimeType'])
206
  file_contents.append({
207
  'name': file['name'],
208
  'text': content
@@ -251,13 +299,13 @@ def check_setup():
251
  status_messages = []
252
 
253
  # Check Google Drive API
254
- if hasattr(gpt_drive, 'drive_initialized') and gpt_drive.drive_initialized:
255
  status_messages.append("βœ… Google Drive API: Connected")
256
  else:
257
  status_messages.append(f"❌ Google Drive API: {getattr(gpt_drive, 'drive_error', 'Not configured')}")
258
 
259
  # Check OpenAI API
260
- if hasattr(gpt_drive, 'openai_initialized') and gpt_drive.openai_initialized:
261
  status_messages.append("βœ… OpenAI API: Connected")
262
  else:
263
  status_messages.append(f"❌ OpenAI API: {getattr(gpt_drive, 'openai_error', 'Not configured')}")
@@ -304,26 +352,44 @@ with gr.Blocks(title="Study Buddy", theme=gr.themes.Soft()) as app:
304
  interactive=False
305
  )
306
 
307
- # Event handlers
308
- submit_btn.click(
309
- fn=process_user_query,
310
- inputs=[query_input, search_terms_input],
311
- outputs=[answer_output, sources_output]
312
- )
 
 
 
 
 
313
 
314
- # Example queries
315
- with gr.Row():
316
- gr.Examples(
317
- examples=[
318
- ["What is morbid Anatomy?", "morbid, Anatomy"],
319
- ["The transmission of nerves from one neuron to another is as a result of what?", "neuron, nerves, Dr Clement"],
320
- ["Explain what the external ear contains of?", "Ear Anatomy, Ear"],
321
- ["What are the types of massage?", "massage Lecture, nerves"],
322
- ["What is trauma?", "Trauma, physical trauma and sex Offenders"],
323
- ["what is Upper limb prosthetics?", "Upper limb prosthetics"],
324
- ],
325
  inputs=[query_input, search_terms_input],
326
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
327
  # Launch the app
328
  if __name__ == "__main__":
329
  app.launch(
 
1
  import os
2
  import json
3
  import requests
4
+ import json
5
  import tempfile
6
  from google.oauth2 import service_account
7
  from googleapiclient.discovery import build
 
9
  import openai
10
  from dotenv import load_dotenv, dotenv_values
11
  import io
 
12
 
13
  from openai import OpenAI
14
  openai.api_key = os.getenv('OPENAI_API_KEY')
15
  openai = OpenAI(api_key = openai.api_key)
16
 
17
+
18
+
19
+
20
  class GPTDriveIntegration:
21
  def __init__(self):
22
  # Build credentials info from individual environment variables
 
48
  )
49
 
50
  self.drive_service = build('drive', 'v3', credentials=self.credentials)
 
 
 
 
51
  # Initialize OpenAI
52
  openai.api_key = os.getenv('OPENAI_API_KEY')
53
 
 
58
  if file_types:
59
  type_queries = []
60
  for file_type in file_types:
61
+ if file_type.lower() == 'pdf':
 
62
  type_queries.append("mimeType='application/pdf'")
63
+ elif file_type.lower() in ['doc', 'docx']:
64
  type_queries.append("mimeType contains 'document'")
65
+ elif file_type.lower() in ['xls', 'xlsx']:
66
  type_queries.append("mimeType contains 'spreadsheet'")
 
 
 
 
67
 
68
  if type_queries:
69
  search_query += f" and ({' or '.join(type_queries)})"
 
75
 
76
  return results.get('files', [])
77
 
78
+ def get_file_content(self, file_id, mime_type):
79
+ """Download and extract text content from file"""
80
  try:
81
+ if 'text' in mime_type or 'document' in mime_type:
82
+ # For Google Docs, export as plain text
83
+ if 'document' in mime_type:
84
+ request = self.drive_service.files().export_media(
85
+ fileId=file_id, mimeType='text/plain'
86
+ )
87
+ else:
88
+ request = self.drive_service.files().get_media(fileId=file_id)
89
+
90
+ file_content = io.BytesIO()
91
+ downloader = MediaIoBaseDownload(file_content, request)
92
+ done = False
93
+ while done is False:
94
+ status, done = downloader.next_chunk()
95
+
96
+ return file_content.getvalue().decode('utf-8')
97
+
98
  elif 'spreadsheet' in mime_type:
99
+ # For Google Sheets, export as CSV
100
  request = self.drive_service.files().export_media(
101
+ fileId=file_id, mimeType='text/csv'
 
102
  )
103
+ file_content = io.BytesIO()
104
+ downloader = MediaIoBaseDownload(file_content, request)
105
+ done = False
106
+ while done is False:
107
+ status, done = downloader.next_chunk()
108
+
109
+ return file_content.getvalue().decode('utf-8')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
+ elif mime_type == 'application/pdf':
112
+ # For PDF files, download binary content and extract text
113
+ request = self.drive_service.files().get_media(fileId=file_id)
114
+ file_content = io.BytesIO()
115
+ downloader = MediaIoBaseDownload(file_content, request)
116
+ done = False
117
+ while done is False:
118
+ status, done = downloader.next_chunk()
119
+
120
+ # Extract text from PDF
121
+ file_content.seek(0)
122
+
123
+ try:
124
+ import PyPDF2
125
+ pdf_reader = PyPDF2.PdfReader(file_content)
126
+ text = ""
127
+ for page in pdf_reader.pages:
128
+ text += page.extract_text() + "\n"
129
+ return text
130
+ except ImportError:
131
+ return "PDF text extraction requires PyPDF2 library"
132
 
133
+ else:
134
+ return "File type not supported for text extraction"
135
+
136
  except Exception as e:
137
+ return f"Error reading file: {str(e)}"
138
 
139
+ def query_gpt_with_context(self, user_query, file_contents):
140
+ """Send query to GPT with file context"""
141
+ context = "\n\n".join([
142
+ f"File: {content['name']}\nContent: {content['text'][:2000]}..."
143
+ for content in file_contents
144
+ ])
145
 
146
+ messages = [
147
+ {
148
+ "role": "system",
149
+ "content": """
150
+ You are an AI assistant that can analyze documents from Google Drive.
151
+ Use the provided file contents to answer user questions."""
152
+ },
153
+ {
154
+ "role": "user",
155
+ "content": f"Context from Google Drive files:\n{context}\n\nUser Question: {user_query}"
156
+ }
157
+ ]
 
 
 
158
 
159
+ response = openai.chat.completions.create(
160
+ model="gpt-4o-mini",
161
+ messages=messages,
162
+ max_tokens=1000
163
+ )
164
+
165
+ return response.choices[0].message.content
166
+
167
+ def process_query(self, user_query, search_terms=None):
168
+ """Main function to process user queries"""
169
+ # Extract search terms from query if not provided
170
+ if not search_terms:
171
+ search_terms = user_query.split()[:3] # Simple extraction
172
+
173
+ # Search for relevant files
174
+ files = []
175
+ for term in search_terms:
176
+ files.extend(self.search_files(term))
177
+
178
+ # Remove duplicates
179
+ unique_files = {f['id']: f for f in files}.values()
180
+
181
+ # Get content from top 3 most relevant files
182
+ file_contents = []
183
+ for file in list(unique_files)[:3]:
184
+ content = self.get_file_content(file['id'], file['mimeType'])
185
+ file_contents.append({
186
+ 'name': file['name'],
187
+ 'text': content
188
+ })
189
+
190
+ # Query GPT with context
191
+ if file_contents:
192
+ response = self.query_gpt_with_context(user_query, file_contents)
193
+ return {
194
+ 'answer': response,
195
+ 'sources': [f['name'] for f in file_contents]
196
+ }
197
+ else:
198
+ return {
199
+ 'answer': "No relevant files found in your Google Drive.",
200
+ 'sources': []
201
+ }
202
 
203
  def query_gpt_with_context(self, user_query, file_contents):
204
  """Send query to GPT with file context"""
205
  context = "\n\n".join([
206
+ f"File: {content['name']}\nContent: {content['text'][:2000]}..."
207
  for content in file_contents
208
  ])
209
 
 
214
  You are an AI assistant that can analyze documents from Google Drive.
215
  Use the provided file contents to answer user questions.
216
  Answer directly and add additional suggestions on how to answer questions in the exam
217
+ Always end with 'Is there anything I can hel you with?'
218
  Your name is Study buddy, happy to help students study more effectively
219
+
220
  """
221
  },
222
  {
 
250
  # Get content from top 3 most relevant files
251
  file_contents = []
252
  for file in list(unique_files)[:3]:
253
+ content = self.get_file_content(file['id'], file['mimeType'])
254
  file_contents.append({
255
  'name': file['name'],
256
  'text': content
 
299
  status_messages = []
300
 
301
  # Check Google Drive API
302
+ if gpt_drive.drive_initialized:
303
  status_messages.append("βœ… Google Drive API: Connected")
304
  else:
305
  status_messages.append(f"❌ Google Drive API: {getattr(gpt_drive, 'drive_error', 'Not configured')}")
306
 
307
  # Check OpenAI API
308
+ if gpt_drive.openai_initialized:
309
  status_messages.append("βœ… OpenAI API: Connected")
310
  else:
311
  status_messages.append(f"❌ OpenAI API: {getattr(gpt_drive, 'openai_error', 'Not configured')}")
 
352
  interactive=False
353
  )
354
 
355
+ # with gr.Column(scale=1):
356
+ # # Status and setup info
357
+ # with gr.Group():
358
+ # gr.Markdown("### System Status")
359
+ # status_btn = gr.Button("Check Status", size="sm")
360
+ # status_output = gr.Textbox(
361
+ # label="API Status",
362
+ # lines=4,
363
+ # interactive=False
364
+ # )
365
+
366
 
367
+ # Event handlers
368
+ submit_btn.click(
369
+ fn=process_user_query,
 
 
 
 
 
 
 
 
370
  inputs=[query_input, search_terms_input],
371
+ outputs=[answer_output, sources_output]
372
+ )
373
+
374
+ # status_btn.click(
375
+ # fn=check_setup,
376
+ # outputs=status_output
377
+ # )
378
+
379
+ # Example queries
380
+ with gr.Row():
381
+ gr.Examples(
382
+ examples=[
383
+ ["What is morbid Anatomy?", "morbid, Anatomy"],
384
+ ["The transmission of nerves from one neuron to another is as a result of what?", "neuron, nerves, Dr Clement"],
385
+ ["Explain what the external ear contains of?", "Ear Anatomy, Ear"],
386
+ ["What are the types of massage?", "massage Lecture, nerves"],
387
+ ["What is trauma?", "Trauma, pysical trauma and sex Offenders"],
388
+ ["what is Upper limb prosthetics?", "Upper limb prosthetics"],
389
+ ],
390
+ inputs=[query_input, search_terms_input],)
391
+
392
+
393
  # Launch the app
394
  if __name__ == "__main__":
395
  app.launch(
requirements.txt CHANGED
@@ -14,5 +14,4 @@ requests
14
  langchain
15
  faiss-cpu
16
  langchain-community
17
- langchain-openai
18
- markitdown[all]
 
14
  langchain
15
  faiss-cpu
16
  langchain-community
17
+ langchain-openai