Ephraimmm committed on
Commit
b88417d
·
verified ·
1 Parent(s): 2c712e9

Update ap.py

Browse files
Files changed (1) hide show
  1. ap.py +190 -444
ap.py CHANGED
@@ -8,39 +8,20 @@ from googleapiclient.http import MediaIoBaseDownload
8
  import openai
9
  from dotenv import load_dotenv, dotenv_values
10
  import io
11
- import logging
12
- from typing import List, Dict, Optional
13
-
14
- # LangChain imports
15
- from langchain.text_splitter import RecursiveCharacterTextSplitter
16
- from langchain_community.vectorstores import FAISS
17
- from langchain_openai import OpenAIEmbeddings, ChatOpenAI
18
- from langchain_community.vectorstores import FAISS
19
- from langchain.docstore.document import Document
20
- from langchain.chains import RetrievalQA
21
- from langchain.prompts import PromptTemplate
22
- from langchain.memory import ConversationBufferMemory
23
- from langchain.chains import ConversationalRetrievalChain
24
- from langchain.schema import BaseRetriever
25
- import pickle
26
- import hashlib
27
 
28
  from openai import OpenAI
29
  openai.api_key = os.getenv('OPENAI_API_KEY')
30
- openai = OpenAI(api_key=openai.api_key)
31
-
32
- # Set up logging
33
- logging.basicConfig(level=logging.INFO)
34
- logger = logging.getLogger(__name__)
35
 
36
- class EnhancedGPTDriveIntegration:
37
  def __init__(self):
38
  # Build credentials info from individual environment variables
39
  credentials_info = {
40
  "type": "service_account",
41
  "project_id": os.getenv('GOOGLE_PROJECT_ID'),
42
  "private_key_id": os.getenv('GOOGLE_PRIVATE_KEY_ID'),
43
- "private_key": os.getenv('GOOGLE_PRIVATE_KEY').replace('\\n', '\n'),
44
  "client_email": os.getenv('GOOGLE_CLIENT_EMAIL'),
45
  "client_id": os.getenv('GOOGLE_CLIENT_ID'),
46
  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
@@ -65,520 +46,285 @@ class EnhancedGPTDriveIntegration:
65
 
66
  self.drive_service = build('drive', 'v3', credentials=self.credentials)
67
 
68
- # Initialize OpenAI and LangChain components
69
- openai.api_key = os.getenv('OPENAI_API_KEY')
70
- self.embeddings = OpenAIEmbeddings()
71
- self.llm = ChatOpenAI(temperature=0.7, model="gpt-3.5-turbo")
72
-
73
- # Text splitter for better chunking
74
- self.text_splitter = RecursiveCharacterTextSplitter(
75
- chunk_size=1000,
76
- chunk_overlap=200,
77
- length_function=len,
78
- separators=["\n\n", "\n", " ", ""]
79
- )
80
-
81
- # Initialize vector store
82
- self.vector_store = None
83
- self.conversation_memory = ConversationBufferMemory(
84
- memory_key="chat_history",
85
- return_messages=True
86
- )
87
 
88
- # Cache for processed files
89
- self.processed_files = {}
90
- self.cache_file = "processed_files_cache.pkl"
91
- self.load_cache()
92
-
93
- def load_cache(self):
94
- """Load processed files cache"""
95
- try:
96
- if os.path.exists(self.cache_file):
97
- with open(self.cache_file, 'rb') as f:
98
- self.processed_files = pickle.load(f)
99
- logger.info(f"Loaded cache with {len(self.processed_files)} files")
100
- except Exception as e:
101
- logger.error(f"Error loading cache: {e}")
102
- self.processed_files = {}
103
-
104
- def save_cache(self):
105
- """Save processed files cache"""
106
- try:
107
- with open(self.cache_file, 'wb') as f:
108
- pickle.dump(self.processed_files, f)
109
- logger.info("Cache saved successfully")
110
- except Exception as e:
111
- logger.error(f"Error saving cache: {e}")
112
-
113
- def get_file_hash(self, file_id: str, file_size: str) -> str:
114
- """Generate hash for file to check if it's been processed"""
115
- return hashlib.md5(f"{file_id}_{file_size}".encode()).hexdigest()
116
 
117
- def search_files(self, query: str, file_types: Optional[List[str]] = None) -> List[Dict]:
118
- """Search for files in Google Drive with improved query handling"""
119
- # Build more sophisticated search query
120
- search_terms = query.lower().split()
121
- search_queries = []
122
-
123
- # Search in file names and content
124
- for term in search_terms:
125
- search_queries.append(f"name contains '{term}' or fullText contains '{term}'")
126
-
127
- search_query = " and ".join([f"({sq})" for sq in search_queries])
128
 
129
  if file_types:
130
  type_queries = []
131
  for file_type in file_types:
132
- if file_type.lower() == 'pdf':
 
133
  type_queries.append("mimeType='application/pdf'")
134
- elif file_type.lower() in ['doc', 'docx']:
135
  type_queries.append("mimeType contains 'document'")
136
- elif file_type.lower() in ['xls', 'xlsx']:
137
  type_queries.append("mimeType contains 'spreadsheet'")
138
- elif file_type.lower() == 'txt':
 
 
139
  type_queries.append("mimeType='text/plain'")
140
 
141
  if type_queries:
142
  search_query += f" and ({' or '.join(type_queries)})"
143
 
144
- try:
145
- results = self.drive_service.files().list(
146
- q=search_query,
147
- fields="files(id, name, mimeType, size, modifiedTime)",
148
- pageSize=20 # Increased to get more results
149
- ).execute()
150
-
151
- files = results.get('files', [])
152
- logger.info(f"Found {len(files)} files matching query: {query}")
153
- return files
154
-
155
- except Exception as e:
156
- logger.error(f"Error searching files: {e}")
157
- return []
158
 
159
- def get_file_content(self, file_id: str, mime_type: str) -> str:
160
- """Download and extract text content from file with better error handling"""
161
  try:
162
- if 'text' in mime_type or 'document' in mime_type:
163
- if 'document' in mime_type:
164
- request = self.drive_service.files().export_media(
165
- fileId=file_id, mimeType='text/plain'
166
- )
167
- else:
168
- request = self.drive_service.files().get_media(fileId=file_id)
169
-
170
- file_content = io.BytesIO()
171
- downloader = MediaIoBaseDownload(file_content, request)
172
- done = False
173
- while done is False:
174
- status, done = downloader.next_chunk()
175
-
176
- return file_content.getvalue().decode('utf-8', errors='ignore')
177
-
178
  elif 'spreadsheet' in mime_type:
 
179
  request = self.drive_service.files().export_media(
180
- fileId=file_id, mimeType='text/csv'
 
181
  )
182
- file_content = io.BytesIO()
183
- downloader = MediaIoBaseDownload(file_content, request)
184
- done = False
185
- while done is False:
186
- status, done = downloader.next_chunk()
187
-
188
- return file_content.getvalue().decode('utf-8', errors='ignore')
189
-
190
- elif mime_type == 'application/pdf':
 
191
  request = self.drive_service.files().get_media(fileId=file_id)
192
- file_content = io.BytesIO()
193
- downloader = MediaIoBaseDownload(file_content, request)
194
- done = False
195
- while done is False:
196
- status, done = downloader.next_chunk()
197
-
198
- file_content.seek(0)
199
-
200
- try:
201
- import PyPDF2
202
- pdf_reader = PyPDF2.PdfReader(file_content)
203
- text = ""
204
- for page in pdf_reader.pages:
205
- text += page.extract_text() + "\n"
206
- return text
207
- except ImportError:
208
- logger.warning("PyPDF2 not available, trying alternative PDF extraction")
209
- # Try alternative PDF extraction
210
- try:
211
- import pdfplumber
212
- with pdfplumber.open(file_content) as pdf:
213
- text = ""
214
- for page in pdf.pages:
215
- text += page.extract_text() + "\n"
216
- return text
217
- except ImportError:
218
- return "PDF text extraction requires PyPDF2 or pdfplumber library"
219
- except Exception as e:
220
- return f"Error extracting PDF text: {str(e)}"
221
 
222
- else:
223
- return "File type not supported for text extraction"
224
-
225
- except Exception as e:
226
- logger.error(f"Error reading file {file_id}: {e}")
227
- return f"Error reading file: {str(e)}"
228
-
229
- def process_documents_to_vector_store(self, files: List[Dict]) -> None:
230
- """Process documents and create/update vector store"""
231
- documents = []
232
- new_files_processed = 0
233
-
234
- for file in files:
235
- file_hash = self.get_file_hash(file['id'], file.get('size', '0'))
236
 
237
- # Check if file is already processed and hasn't changed
238
- if file_hash in self.processed_files:
239
- # Load cached documents
240
- cached_docs = self.processed_files[file_hash]
241
- documents.extend(cached_docs)
242
- continue
243
 
244
- # Process new or changed file
245
- content = self.get_file_content(file['id'], file['mimeType'])
246
 
247
- if content and not content.startswith('Error'):
248
- # Split content into chunks
249
- chunks = self.text_splitter.split_text(content)
250
-
251
- # Create Document objects with metadata
252
- file_documents = []
253
- for i, chunk in enumerate(chunks):
254
- doc = Document(
255
- page_content=chunk,
256
- metadata={
257
- 'source': file['name'],
258
- 'file_id': file['id'],
259
- 'chunk_id': i,
260
- 'mime_type': file['mimeType'],
261
- 'total_chunks': len(chunks)
262
- }
263
- )
264
- file_documents.append(doc)
265
-
266
- documents.extend(file_documents)
267
-
268
- # Cache the processed documents
269
- self.processed_files[file_hash] = file_documents
270
- new_files_processed += 1
271
-
272
- logger.info(f"Processed file: {file['name']} ({len(chunks)} chunks)")
273
 
274
- if new_files_processed > 0:
275
- self.save_cache()
276
- logger.info(f"Processed {new_files_processed} new files")
 
 
 
 
 
 
 
 
 
 
 
 
277
 
278
- # Create or update vector store
279
- if documents:
280
- if self.vector_store is None:
281
- self.vector_store = FAISS.from_documents(documents, self.embeddings)
282
- logger.info(f"Created new vector store with {len(documents)} documents")
283
- else:
284
- # Add new documents to existing vector store
285
- new_docs = [doc for file_docs in self.processed_files.values()
286
- for doc in file_docs if doc not in documents]
287
- if new_docs:
288
- self.vector_store.add_documents(new_docs)
289
- logger.info(f"Added {len(new_docs)} new documents to vector store")
290
 
291
- def create_conversational_chain(self) -> ConversationalRetrievalChain:
292
- """Create a conversational retrieval chain"""
293
- if self.vector_store is None:
294
- raise ValueError("Vector store not initialized. Process documents first.")
 
 
295
 
296
- # Create custom prompt template
297
- prompt_template = """You are Study Buddy, an AI assistant specialized in helping students study anatomy effectively.
298
- Use the following context from the student's study materials to answer their question.
299
-
300
- Context: {context}
301
-
302
- Question: {question}
303
-
304
- Instructions:
305
- 1. Answer the question directly and comprehensively using the provided context
306
- 2. If the context doesn't contain enough information, say so clearly
307
- 3. Provide study tips or exam strategies when relevant
308
- 4. Use clear, educational language appropriate for students
309
- 5. Always end your response with "Is there anything else I can help you with?"
310
-
311
- Answer:"""
312
-
313
- PROMPT = PromptTemplate(
314
- template=prompt_template,
315
- input_variables=["context", "question"]
316
- )
317
 
318
- # Create retrieval chain
319
- qa_chain = ConversationalRetrievalChain.from_llm(
320
- llm=self.llm,
321
- retriever=self.vector_store.as_retriever(
322
- search_type="similarity",
323
- search_kwargs={"k": 6} # Retrieve top 6 relevant chunks
324
- ),
325
- memory=self.conversation_memory,
326
- combine_docs_chain_kwargs={"prompt": PROMPT},
327
- return_source_documents=True,
328
- verbose=True
329
  )
330
 
331
- return qa_chain
332
 
333
- def process_query(self, user_query: str, search_terms: Optional[List[str]] = None) -> Dict:
334
- """Enhanced query processing with LangChain"""
335
- try:
336
- # Extract search terms from query if not provided
337
- if not search_terms:
338
- search_terms = user_query.lower().split()[:5] # Take first 5 words
339
-
340
- # Search for relevant files
341
- all_files = []
342
- for term in search_terms:
343
- files = self.search_files(term)
344
- all_files.extend(files)
345
-
346
- # Remove duplicates while preserving order
347
- unique_files = []
348
- seen_ids = set()
349
- for file in all_files:
350
- if file['id'] not in seen_ids:
351
- unique_files.append(file)
352
- seen_ids.add(file['id'])
353
-
354
- if not unique_files:
355
- return {
356
- 'answer': "No relevant files found in your Google Drive for this query. Please check if you have uploaded study materials related to your question.",
357
- 'sources': [],
358
- 'confidence': 'low'
359
- }
360
-
361
- # Process documents and create vector store
362
- self.process_documents_to_vector_store(unique_files[:10]) # Process top 10 files
363
-
364
- if self.vector_store is None:
365
- return {
366
- 'answer': "Unable to process the documents. Please check if the files contain readable text content.",
367
- 'sources': [],
368
- 'confidence': 'low'
369
- }
370
-
371
- # Create conversational chain and get answer
372
- qa_chain = self.create_conversational_chain()
373
-
374
- # Query the chain
375
- result = qa_chain({"question": user_query})
376
-
377
- # Extract source documents
378
- source_docs = result.get('source_documents', [])
379
- sources = list(set([doc.metadata['source'] for doc in source_docs]))
380
-
381
- # Calculate confidence based on source document relevance
382
- confidence = 'high' if len(source_docs) >= 3 else 'medium' if len(source_docs) >= 1 else 'low'
383
-
384
- return {
385
- 'answer': result['answer'],
386
- 'sources': sources,
387
- 'confidence': confidence,
388
- 'total_files_searched': len(unique_files),
389
- 'chunks_retrieved': len(source_docs)
390
- }
391
-
392
- except Exception as e:
393
- logger.error(f"Error processing query: {e}")
394
  return {
395
- 'answer': f"An error occurred while processing your query: {str(e)}. Please try again or rephrase your question.",
396
- 'sources': [],
397
- 'confidence': 'low'
398
  }
399
-
400
- def clear_memory(self):
401
- """Clear conversation memory"""
402
- self.conversation_memory.clear()
403
- logger.info("Conversation memory cleared")
404
-
405
- def get_vector_store_stats(self) -> Dict:
406
- """Get statistics about the vector store"""
407
- if self.vector_store is None:
408
- return {"total_documents": 0, "total_files": 0}
409
-
410
- try:
411
- total_docs = len(self.vector_store.docstore._dict)
412
- total_files = len(set([doc.metadata.get('source', 'Unknown')
413
- for doc in self.vector_store.docstore._dict.values()]))
414
-
415
  return {
416
- "total_documents": total_docs,
417
- "total_files": total_files,
418
- "cache_size": len(self.processed_files)
419
  }
420
- except:
421
- return {"total_documents": "Unknown", "total_files": "Unknown"}
422
 
423
- # Initialize the enhanced system
424
- enhanced_gpt_drive = EnhancedGPTDriveIntegration()
425
 
426
- def process_user_query(query: str, search_terms_input: str) -> tuple:
427
  """Process user query and return formatted response"""
428
  if not query.strip():
429
- return "Please enter a question.", "", ""
430
 
431
  # Parse search terms if provided
432
  search_terms = None
433
- if search_terms_input.strip():
434
- search_terms = [term.strip() for term in search_terms_input.split(',')]
435
 
436
  # Process the query
437
- result = enhanced_gpt_drive.process_query(query, search_terms)
438
 
439
  # Format the response
440
  answer = result['answer']
441
  sources = result['sources']
442
 
443
- # Create detailed sources text
444
  sources_text = ""
445
  if sources:
446
  sources_text = "**Sources used:**\n" + "\n".join([f"β€’ {source}" for source in sources])
447
- sources_text += f"\n\n**Search Details:**\n"
448
- sources_text += f"β€’ Files searched: {result.get('total_files_searched', 0)}\n"
449
- sources_text += f"β€’ Relevant chunks found: {result.get('chunks_retrieved', 0)}\n"
450
- sources_text += f"β€’ Confidence: {result.get('confidence', 'unknown').title()}"
451
-
452
- # Stats for display
453
- stats = enhanced_gpt_drive.get_vector_store_stats()
454
- stats_text = f"**Knowledge Base:** {stats['total_documents']} chunks from {stats['total_files']} files"
455
 
456
- return answer, sources_text, stats_text
457
-
458
- def clear_conversation():
459
- """Clear conversation memory"""
460
- enhanced_gpt_drive.clear_memory()
461
- return "Conversation history cleared. You can start a fresh conversation now."
462
 
463
- def get_system_status():
464
- """Get system status information"""
465
- stats = enhanced_gpt_drive.get_vector_store_stats()
466
 
467
- status_lines = [
468
- "βœ… Google Drive API: Connected",
469
- "βœ… OpenAI API: Connected",
470
- "βœ… LangChain: Initialized",
471
- f"πŸ“š Knowledge Base: {stats['total_documents']} document chunks",
472
- f"πŸ“ Processed Files: {stats['total_files']} files",
473
- f"πŸ’Ύ Cache Size: {stats['cache_size']} entries"
474
- ]
475
 
476
- return "\n".join(status_lines)
477
-
478
- # Create enhanced Gradio interface
479
- import gradio as gr
 
 
 
480
 
481
- with gr.Blocks(title="Enhanced Study Buddy", theme=gr.themes.Soft()) as app:
482
- gr.Markdown("# 🧠 Enhanced Anatomy Study Buddy with LangChain")
483
- gr.Markdown("Study more effectively with advanced AI-powered document analysis and conversational memory!")
 
 
484
 
485
  with gr.Row():
486
- with gr.Column(scale=3):
487
  # Main query interface
488
  with gr.Group():
489
- gr.Markdown("### πŸ’¬ Ask a Question")
490
  query_input = gr.Textbox(
491
  label="Your Question",
492
- placeholder="Ask me anything about your anatomy study materials...",
493
  lines=3
494
  )
495
 
496
  search_terms_input = gr.Textbox(
497
- label="πŸ” Search Terms (Optional)",
498
- placeholder="Enter comma-separated terms to focus the search",
499
  lines=1
500
  )
501
 
502
- with gr.Row():
503
- submit_btn = gr.Button("πŸš€ Search & Ask", variant="primary", size="lg")
504
- clear_btn = gr.Button("🧹 Clear Memory", variant="secondary")
505
 
506
  # Results section
507
  with gr.Group():
508
- gr.Markdown("### 🎯 Answer")
509
  answer_output = gr.Textbox(
510
  label="AI Response",
511
- lines=12,
512
  interactive=False
513
  )
514
 
515
  sources_output = gr.Textbox(
516
- label="πŸ“š Sources & Details",
517
- lines=6,
518
  interactive=False
519
  )
520
 
521
- with gr.Column(scale=1):
522
- # System info
523
- with gr.Group():
524
- gr.Markdown("### πŸ“Š System Status")
525
- status_btn = gr.Button("πŸ”„ Refresh Status", size="sm")
526
- status_output = gr.Textbox(
527
- label="System Information",
528
- lines=8,
529
- interactive=False
530
- )
531
-
532
- stats_output = gr.Textbox(
533
- label="Knowledge Base",
534
- lines=2,
535
- interactive=False
536
- )
537
-
538
  # Event handlers
539
  submit_btn.click(
540
  fn=process_user_query,
541
  inputs=[query_input, search_terms_input],
542
- outputs=[answer_output, sources_output, stats_output]
543
  )
544
 
545
- clear_btn.click(
546
- fn=clear_conversation,
547
- outputs=answer_output
548
- )
549
-
550
- status_btn.click(
551
- fn=get_system_status,
552
- outputs=status_output
553
- )
554
-
555
- # Enhanced examples
556
  with gr.Row():
557
  gr.Examples(
558
  examples=[
559
- ["What is morbid anatomy and how does it relate to pathology?", "morbid, anatomy, pathology"],
560
- ["Explain the neural transmission process between neurons", "neuron, transmission, synaptic"],
561
- ["Describe the complete anatomy of the external ear", "external ear, anatomy, auditory"],
562
- ["What are the different types of therapeutic massage?", "massage, therapy, treatment"],
563
- ["Define trauma and its classification in medical terms", "trauma, medical, classification"],
564
- ["Explain upper limb prosthetics and their applications", "prosthetics, upper limb, rehabilitation"],
565
- ["How does the nervous system control muscle movement?", "nervous system, muscle, motor control"],
566
- ["What are the key anatomical landmarks for injection sites?", "injection sites, anatomical landmarks"]
567
  ],
568
- inputs=[query_input, search_terms_input]
569
  )
570
-
571
- # Initial status load
572
- app.load(
573
- fn=get_system_status,
574
- outputs=status_output
575
- )
576
 
577
- # Launch the enhanced app
578
  if __name__ == "__main__":
579
- app.launch(
580
- share=True,
581
- debug=True,
582
- server_name="0.0.0.0",
583
- server_port=7860
584
- )
 
8
  import openai
9
  from dotenv import load_dotenv, dotenv_values
10
  import io
11
+ from markitdown import MarkItDown
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  from openai import OpenAI
14
  openai.api_key = os.getenv('OPENAI_API_KEY')
15
+ openai = OpenAI(api_key = openai.api_key)
 
 
 
 
16
 
17
+ class GPTDriveIntegration:
18
  def __init__(self):
19
  # Build credentials info from individual environment variables
20
  credentials_info = {
21
  "type": "service_account",
22
  "project_id": os.getenv('GOOGLE_PROJECT_ID'),
23
  "private_key_id": os.getenv('GOOGLE_PRIVATE_KEY_ID'),
24
+ "private_key": os.getenv('GOOGLE_PRIVATE_KEY').replace('\\n', '\n'), # Fix line breaks
25
  "client_email": os.getenv('GOOGLE_CLIENT_EMAIL'),
26
  "client_id": os.getenv('GOOGLE_CLIENT_ID'),
27
  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
 
46
 
47
  self.drive_service = build('drive', 'v3', credentials=self.credentials)
48
 
49
+ # Initialize MarkItDown
50
+ self.md = MarkItDown()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
+ # Initialize OpenAI
53
+ openai.api_key = os.getenv('OPENAI_API_KEY')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
+ def search_files(self, query, file_types=None):
56
+ """Search for files in Google Drive"""
57
+ search_query = f"name contains '{query}'"
 
 
 
 
 
 
 
 
58
 
59
  if file_types:
60
  type_queries = []
61
  for file_type in file_types:
62
+ ext = file_type.lower().lstrip('.')
63
+ if ext == 'pdf':
64
  type_queries.append("mimeType='application/pdf'")
65
+ elif ext in ['doc', 'docx']:
66
  type_queries.append("mimeType contains 'document'")
67
+ elif ext in ['xls', 'xlsx']:
68
  type_queries.append("mimeType contains 'spreadsheet'")
69
+ elif ext in ['ppt', 'pptx']:
70
+ type_queries.append("mimeType contains 'presentation'")
71
+ elif ext in ['txt', 'md', 'markdown']:
72
  type_queries.append("mimeType='text/plain'")
73
 
74
  if type_queries:
75
  search_query += f" and ({' or '.join(type_queries)})"
76
 
77
+ results = self.drive_service.files().list(
78
+ q=search_query,
79
+ fields="files(id, name, mimeType, size)"
80
+ ).execute()
81
+
82
+ return results.get('files', [])
 
 
 
 
 
 
 
 
83
 
84
+ def get_file_content(self, file_id, file_name, mime_type):
85
+ """Download and extract content from file using MarkItDown"""
86
  try:
87
+ # Handle Google Workspace files - export to appropriate format for MarkItDown
88
+ if 'document' in mime_type:
89
+ # Export Google Docs as DOCX for better formatting preservation
90
+ request = self.drive_service.files().export_media(
91
+ fileId=file_id,
92
+ mimeType='application/vnd.openxmlformats-officedocument.wordprocessingml.document'
93
+ )
94
+ file_extension = 'docx'
 
 
 
 
 
 
 
 
95
  elif 'spreadsheet' in mime_type:
96
+ # Export Google Sheets as XLSX
97
  request = self.drive_service.files().export_media(
98
+ fileId=file_id,
99
+ mimeType='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
100
  )
101
+ file_extension = 'xlsx'
102
+ elif 'presentation' in mime_type:
103
+ # Export Google Slides as PPTX
104
+ request = self.drive_service.files().export_media(
105
+ fileId=file_id,
106
+ mimeType='application/vnd.openxmlformats-officedocument.presentationml.presentation'
107
+ )
108
+ file_extension = 'pptx'
109
+ else:
110
+ # For regular files, download as-is
111
  request = self.drive_service.files().get_media(fileId=file_id)
112
+ file_extension = self._get_extension_from_name_or_mime(file_name, mime_type)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
+ # Download file content
115
+ file_content = io.BytesIO()
116
+ downloader = MediaIoBaseDownload(file_content, request)
117
+ done = False
118
+ while done is False:
119
+ status, done = downloader.next_chunk()
 
 
 
 
 
 
 
 
120
 
121
+ # Reset stream position
122
+ file_content.seek(0)
 
 
 
 
123
 
124
+ # Use MarkItDown to convert to markdown
125
+ result = self.md.convert_stream(file_content, file_extension=file_extension)
126
 
127
+ return result.text_content
128
+
129
+ except Exception as e:
130
+ return f"Error processing file with MarkItDown: {str(e)}"
131
+
132
+ def _get_extension_from_name_or_mime(self, file_name, mime_type):
133
+ """Helper to determine file extension for MarkItDown"""
134
+ # First try to get extension from filename
135
+ if '.' in file_name:
136
+ return file_name.split('.')[-1].lower()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
+ # Fallback to mime type mapping
139
+ mime_to_ext = {
140
+ 'application/pdf': 'pdf',
141
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
142
+ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
143
+ 'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
144
+ 'application/msword': 'doc',
145
+ 'application/vnd.ms-excel': 'xls',
146
+ 'application/vnd.ms-powerpoint': 'ppt',
147
+ 'text/plain': 'txt',
148
+ 'text/markdown': 'md',
149
+ 'text/html': 'html',
150
+ 'application/json': 'json',
151
+ 'text/csv': 'csv'
152
+ }
153
 
154
+ return mime_to_ext.get(mime_type, 'txt')
 
 
 
 
 
 
 
 
 
 
 
155
 
156
+ def query_gpt_with_context(self, user_query, file_contents):
157
+ """Send query to GPT with file context"""
158
+ context = "\n\n".join([
159
+ f"File: {content['name']}\nContent: {content['text'][:3000]}..."
160
+ for content in file_contents
161
+ ])
162
 
163
+ messages = [
164
+ {
165
+ "role": "system",
166
+ "content": """
167
+ You are an AI assistant that can analyze documents from Google Drive.
168
+ Use the provided file contents to answer user questions.
169
+ Answer directly and add additional suggestions on how to answer questions in the exam
170
+ Always end with 'Is there anything I can help you with?'
171
+ Your name is Study buddy, happy to help students study more effectively
172
+ """
173
+ },
174
+ {
175
+ "role": "user",
176
+ "content": f"Context from Google Drive files:\n{context}\n\nUser Question: {user_query}"
177
+ }
178
+ ]
 
 
 
 
 
179
 
180
+ response = openai.chat.completions.create(
181
+ model="gpt-4o-mini",
182
+ messages=messages,
183
+ max_tokens=1000
 
 
 
 
 
 
 
184
  )
185
 
186
+ return response.choices[0].message.content
187
 
188
+ def process_query(self, user_query, search_terms=None):
189
+ """Main function to process user queries"""
190
+ # Extract search terms from query if not provided
191
+ if not search_terms:
192
+ search_terms = user_query.split()[:3] # Simple extraction
193
+
194
+ # Search for relevant files
195
+ files = []
196
+ for term in search_terms:
197
+ files.extend(self.search_files(term))
198
+
199
+ # Remove duplicates
200
+ unique_files = {f['id']: f for f in files}.values()
201
+
202
+ # Get content from top 3 most relevant files
203
+ file_contents = []
204
+ for file in list(unique_files)[:3]:
205
+ content = self.get_file_content(file['id'], file['name'], file['mimeType'])
206
+ file_contents.append({
207
+ 'name': file['name'],
208
+ 'text': content
209
+ })
210
+
211
+ # Query GPT with context
212
+ if file_contents:
213
+ response = self.query_gpt_with_context(user_query, file_contents)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  return {
215
+ 'answer': response,
216
+ 'sources': [f['name'] for f in file_contents]
 
217
  }
218
+ else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  return {
220
+ 'answer': "No relevant files found in your Google Drive.",
221
+ 'sources': []
 
222
  }
 
 
223
 
224
+ gpt_drive = GPTDriveIntegration()
 
225
 
226
def process_user_query(query, search_terms_input):
    """Process user query and return formatted response.

    Args:
        query: The question typed by the user.
        search_terms_input: Comma-separated search terms from the UI; may be
            empty, in which case terms are derived from the question.

    Returns:
        A ``(answer, sources_text)`` tuple of strings for the two output
        boxes.
    """
    if not query or not query.strip():
        return "Please enter a question.", ""

    # Parse search terms if provided. The UI exposes a "Search Terms" box, so
    # its contents are honoured. Blank entries (e.g. from "a,,b") are dropped,
    # and an all-blank input falls back to automatic extraction.
    search_terms = None
    if search_terms_input and search_terms_input.strip():
        parsed_terms = [term.strip() for term in search_terms_input.split(',')]
        search_terms = [term for term in parsed_terms if term] or None

    # Process the query
    result = gpt_drive.process_query(query, search_terms)

    # Format the response
    answer = result['answer']
    sources = result['sources']

    sources_text = ""
    if sources:
        sources_text = "**Sources used:**\n" + "\n".join(f"• {source}" for source in sources)

    return answer, sources_text
 
 
 
 
 
248
 
249
def check_setup():
    """Check if the APIs are properly configured.

    Reads optional ``*_initialized`` flags and ``*_error`` messages from the
    module-level ``gpt_drive`` object.

    Returns:
        A two-line string, one status line per API.
    """
    # NOTE(review): the visible parts of GPTDriveIntegration.__init__ never
    # set these attributes, so both lines may always read "Not configured";
    # confirm against the rest of __init__.
    checks = (
        ("Google Drive API", "drive_initialized", "drive_error"),
        ("OpenAI API", "openai_initialized", "openai_error"),
    )

    status_messages = []
    for label, flag_attr, error_attr in checks:
        if getattr(gpt_drive, flag_attr, False):
            status_messages.append(f"✅ {label}: Connected")
        else:
            reason = getattr(gpt_drive, error_attr, 'Not configured')
            status_messages.append(f"❌ {label}: {reason}")

    return "\n".join(status_messages)
266
 
267
# Build the Gradio UI: a question box, a search-terms box, a submit button and
# two read-only result boxes, followed by clickable example queries.
import gradio as gr

with gr.Blocks(title="Study Buddy", theme=gr.themes.Soft()) as app:
    gr.Markdown("# Anatomy Study Buddy ")
    gr.Markdown("Study more effectively with study Buddy!")

    with gr.Row():
        with gr.Column(scale=2):
            # Input area
            with gr.Group():
                gr.Markdown("### Ask a Question")
                query_input = gr.Textbox(
                    lines=3,
                    label="Your Question",
                    placeholder="Ask me any question about your anatomy books?",
                )

                search_terms_input = gr.Textbox(
                    lines=1,
                    label="Search Terms",
                    placeholder="Enter comma-separated terms to search for specific files",
                )

                submit_btn = gr.Button("Search & Ask", variant="primary", size="lg")

            # Output area
            with gr.Group():
                gr.Markdown("### Answer")
                answer_output = gr.Textbox(
                    lines=10,
                    label="AI Response",
                    interactive=False,
                )

                sources_output = gr.Textbox(
                    lines=3,
                    label="Sources",
                    interactive=False,
                )

    # Clicking the button runs the query and fills both result boxes.
    submit_btn.click(
        fn=process_user_query,
        inputs=[query_input, search_terms_input],
        outputs=[answer_output, sources_output],
    )

    # Each example row is [question, search terms].
    with gr.Row():
        gr.Examples(
            examples=[
                ["What is morbid Anatomy?", "morbid, Anatomy"],
                ["The transmission of nerves from one neuron to another is as a result of what?", "neuron, nerves, Dr Clement"],
                ["Explain what the external ear contains of?", "Ear Anatomy, Ear"],
                ["What are the types of massage?", "massage Lecture, nerves"],
                ["What is trauma?", "Trauma, physical trauma and sex Offenders"],
                ["what is Upper limb prosthetics?", "Upper limb prosthetics"],
            ],
            inputs=[query_input, search_terms_input],
        )

# Start the app only when the file is run as a script.
if __name__ == "__main__":
    app.launch(share=True, debug=True)