Redfire-1234 committed on
Commit
f80f8fd
·
verified ·
1 Parent(s): 0c0f014

Update app/main.py

Browse files
Files changed (1) hide show
  1. app/main.py +220 -442
app/main.py CHANGED
@@ -1,6 +1,5 @@
1
  from fastapi import FastAPI, HTTPException
2
  from fastapi.staticfiles import StaticFiles
3
- from fastapi.responses import FileResponse
4
  from fastapi.middleware.cors import CORSMiddleware
5
  from typing import List
6
  import os
@@ -23,30 +22,16 @@ app = FastAPI(
23
  @app.on_event("startup")
24
  async def startup_event():
25
  """Display clickable link on startup"""
26
- import os
27
-
28
- # Detect if running on HuggingFace Spaces
29
- space_id = os.getenv("SPACE_ID")
30
-
31
  print("\n" + "="*70)
32
  print("🚀 Google Docs Knowledge Chatbot is running!")
33
  print("="*70)
34
-
35
- if space_id:
36
- # Running on HuggingFace Spaces
37
- print("\n📱 Application deployed on HuggingFace Spaces")
38
- print(f" Space ID: {space_id}")
39
- else:
40
- # Running locally
41
- print("\n📱 Access the application here:")
42
- print("\n 👉 \033[94m\033[4mhttp://localhost:8000\033[0m\n")
43
-
44
  print("="*70)
45
  print("\n💡 Quick Tips:")
46
  print(" • Click 'Index All Documents' to get started")
47
  print(" • Make sure your Google Drive folder is shared")
48
- if not space_id:
49
- print(" • Press CTRL+C to stop the server")
50
  print("\n" + "="*70 + "\n")
51
 
52
  # Add CORS middleware
@@ -62,7 +47,7 @@ app.add_middleware(
62
  settings = get_settings()
63
 
64
  # Initialize services
65
- drive_service = GoogleDriveService(settings.get_google_credentials_dict())
66
  chunker = TextChunker(chunk_size=settings.chunk_size, chunk_overlap=settings.chunk_overlap)
67
  embedding_engine = EmbeddingEngine()
68
  llm_service = LLMService(settings.groq_api_key)
@@ -70,18 +55,9 @@ llm_service = LLMService(settings.groq_api_key)
70
  # Create data directory
71
  os.makedirs(settings.vector_store_path, exist_ok=True)
72
 
73
- # Mount static files
74
- app.mount("/static", StaticFiles(directory="frontend"), name="static")
75
-
76
 
77
  @app.get("/")
78
  async def root():
79
- """Serve the frontend HTML"""
80
- return FileResponse("frontend/index.html")
81
-
82
-
83
- @app.get("/api/status")
84
- async def api_status():
85
  """Health check endpoint"""
86
  return {
87
  "status": "running",
@@ -129,16 +105,67 @@ async def index_all_documents():
129
  """
130
  try:
131
  # Get all documents in folder
132
- docs = drive_service.list_documents_in_folder(settings.google_drive_folder_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
  if not docs:
135
- raise HTTPException(status_code=404, detail="No documents found in the configured folder")
 
 
 
 
 
 
 
 
 
 
 
136
 
137
  print(f"Found {len(docs)} documents in folder")
138
 
139
  # Initialize vector store
140
  vector_store = VectorStore(dimension=embedding_engine.dimension)
141
  total_chunks = 0
 
 
142
 
143
  # Process each document
144
  for doc in docs:
@@ -146,23 +173,82 @@ async def index_all_documents():
146
  print(f"Processing: {doc['name']} ({doc['id']})")
147
 
148
  # Read document
149
- text = drive_service.get_document_content(doc['id'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
 
151
  if not text or len(text.strip()) == 0:
152
- print(f" Skipping empty document: {doc['name']}")
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  continue
154
 
155
  # Chunk text
156
  chunks = chunker.chunk_text(text)
157
 
158
  if not chunks:
159
- print(f" No chunks created for: {doc['name']}")
 
 
 
 
160
  continue
161
 
162
  print(f" Created {len(chunks)} chunks")
163
 
164
- # Generate embeddings
165
- embeddings = embedding_engine.encode(chunks)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
  # Add to vector store with metadata
168
  metadata = {
@@ -173,28 +259,64 @@ async def index_all_documents():
173
  vector_store.add_documents(chunks, embeddings, metadata)
174
 
175
  total_chunks += len(chunks)
176
- print(f" Added {len(chunks)} chunks to index")
 
177
 
178
  except Exception as e:
179
- print(f" Error processing {doc['name']}: {str(e)}")
 
 
 
 
180
  continue
181
 
182
  if total_chunks == 0:
183
- raise HTTPException(status_code=400, detail="No valid content to index")
 
 
 
 
 
 
 
 
 
 
184
 
185
  # Save the unified vector store
186
  vector_store.save(settings.vector_store_path, "all_docs")
187
 
188
- return IndexResponse(
189
- message=f"Successfully indexed all documents from folder",
190
- chunks_indexed=total_chunks,
191
- documents_processed=len(docs)
192
- )
 
 
 
 
 
 
 
 
 
 
193
 
194
  except HTTPException:
195
  raise
196
  except Exception as e:
197
- raise HTTPException(status_code=500, detail=f"Error indexing documents: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
198
 
199
 
200
  @app.post("/index-document", response_model=IndexResponse)
@@ -342,24 +464,68 @@ async def chat(request: ChatRequest):
342
  except HTTPException:
343
  raise
344
  except Exception as e:
345
- # Better error handling
346
  error_msg = str(e)
347
 
348
- # Check for rate limit errors
349
- if "rate_limit" in error_msg.lower() or "429" in error_msg:
350
  raise HTTPException(
351
  status_code=429,
352
- detail="Rate limit exceeded. Please wait a moment and try again."
 
 
 
 
 
 
 
 
 
353
  )
354
 
355
- # Check for API errors
356
- if "api" in error_msg.lower() or "authentication" in error_msg.lower():
357
  raise HTTPException(
358
  status_code=503,
359
- detail="LLM service temporarily unavailable. Please try again later."
 
 
 
 
 
 
 
 
 
360
  )
361
 
362
- raise HTTPException(status_code=500, detail=f"Error processing chat: {error_msg}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
 
364
 
365
  @app.post("/reindex")
@@ -410,394 +576,6 @@ async def clear_index():
410
  except Exception as e:
411
  raise HTTPException(status_code=500, detail=f"Error clearing index: {str(e)}")
412
 
413
- # from fastapi import FastAPI, HTTPException
414
- # from fastapi.staticfiles import StaticFiles
415
- # from fastapi.middleware.cors import CORSMiddleware
416
- # from typing import List
417
- # import os
418
-
419
- # from app.config import get_settings
420
- # from app.models import ChatRequest, ChatResponse, IndexRequest, IndexResponse, DocumentInfo
421
- # from app.services.google_drive import GoogleDriveService
422
- # from app.services.chunker import TextChunker
423
- # from app.services.embeddings import EmbeddingEngine
424
- # from app.services.vector_store import VectorStore
425
- # from app.services.llm import LLMService
426
-
427
- # # Initialize FastAPI app
428
- # app = FastAPI(
429
- # title="Google Docs Knowledge Chatbot",
430
- # description="RAG-based chatbot for Google Docs with folder support",
431
- # version="2.0.0"
432
- # )
433
-
434
- # @app.on_event("startup")
435
- # async def startup_event():
436
- # """Display clickable link on startup"""
437
- # print("\n" + "="*70)
438
- # print("🚀 Google Docs Knowledge Chatbot is running!")
439
- # print("="*70)
440
- # print("\n📱 Access the application here:")
441
- # print("\n 👉 \033[94m\033[4mhttp://localhost:8000/static/index.html\033[0m\n")
442
- # print("="*70)
443
- # print("\n💡 Quick Tips:")
444
- # print(" • Click 'Index All Documents' to get started")
445
- # print(" • Make sure your Google Drive folder is shared")
446
- # print(" • Press CTRL+C to stop the server")
447
- # print("\n" + "="*70 + "\n")
448
-
449
- # # Add CORS middleware
450
- # app.add_middleware(
451
- # CORSMiddleware,
452
- # allow_origins=["*"],
453
- # allow_credentials=True,
454
- # allow_methods=["*"],
455
- # allow_headers=["*"],
456
- # )
457
-
458
- # # Get settings
459
- # settings = get_settings()
460
-
461
- # # Initialize services
462
- # drive_service = GoogleDriveService(settings.get_google_credentials_dict())
463
- # chunker = TextChunker(chunk_size=settings.chunk_size, chunk_overlap=settings.chunk_overlap)
464
- # embedding_engine = EmbeddingEngine()
465
- # llm_service = LLMService(settings.groq_api_key)
466
-
467
- # # Create data directory
468
- # os.makedirs(settings.vector_store_path, exist_ok=True)
469
-
470
-
471
- # @app.get("/")
472
- # async def root():
473
- # """Health check endpoint"""
474
- # return {
475
- # "status": "running",
476
- # "message": "Google Docs Knowledge Chatbot API v2.0",
477
- # "features": ["folder-based", "multi-document", "auto-discovery"]
478
- # }
479
-
480
-
481
- # @app.get("/documents", response_model=List[DocumentInfo])
482
- # async def list_documents():
483
- # """
484
- # List all documents in the configured Google Drive folder
485
- # """
486
- # try:
487
- # docs = drive_service.list_documents_in_folder(settings.google_drive_folder_id)
488
-
489
- # # Check which ones are indexed
490
- # result = []
491
- # for doc in docs:
492
- # indexed = os.path.exists(
493
- # os.path.join(settings.vector_store_path, f"all_docs_index.faiss")
494
- # )
495
- # result.append(DocumentInfo(
496
- # id=doc['id'],
497
- # name=doc['name'],
498
- # modified=doc['modified'],
499
- # indexed=indexed
500
- # ))
501
-
502
- # return result
503
-
504
- # except Exception as e:
505
- # raise HTTPException(status_code=500, detail=f"Error listing documents: {str(e)}")
506
-
507
-
508
- # @app.post("/index-all", response_model=IndexResponse)
509
- # async def index_all_documents():
510
- # """
511
- # Index ALL documents in the Google Drive folder
512
-
513
- # This is the recommended approach:
514
- # - Automatically discovers all docs in folder
515
- # - Creates one unified vector store
516
- # - No need to index individually
517
- # """
518
- # try:
519
- # # Get all documents in folder
520
- # docs = drive_service.list_documents_in_folder(settings.google_drive_folder_id)
521
-
522
- # if not docs:
523
- # raise HTTPException(status_code=404, detail="No documents found in the configured folder")
524
-
525
- # print(f"Found {len(docs)} documents in folder")
526
-
527
- # # Initialize vector store
528
- # vector_store = VectorStore(dimension=embedding_engine.dimension)
529
- # total_chunks = 0
530
-
531
- # # Process each document
532
- # for doc in docs:
533
- # try:
534
- # print(f"Processing: {doc['name']} ({doc['id']})")
535
-
536
- # # Read document
537
- # text = drive_service.get_document_content(doc['id'])
538
-
539
- # if not text or len(text.strip()) == 0:
540
- # print(f" Skipping empty document: {doc['name']}")
541
- # continue
542
-
543
- # # Chunk text
544
- # chunks = chunker.chunk_text(text)
545
-
546
- # if not chunks:
547
- # print(f" No chunks created for: {doc['name']}")
548
- # continue
549
-
550
- # print(f" Created {len(chunks)} chunks")
551
-
552
- # # Generate embeddings
553
- # embeddings = embedding_engine.encode(chunks)
554
-
555
- # # Add to vector store with metadata
556
- # metadata = {
557
- # 'doc_id': doc['id'],
558
- # 'doc_name': doc['name'],
559
- # 'modified': doc['modified']
560
- # }
561
- # vector_store.add_documents(chunks, embeddings, metadata)
562
-
563
- # total_chunks += len(chunks)
564
- # print(f" Added {len(chunks)} chunks to index")
565
-
566
- # except Exception as e:
567
- # print(f" Error processing {doc['name']}: {str(e)}")
568
- # continue
569
-
570
- # if total_chunks == 0:
571
- # raise HTTPException(status_code=400, detail="No valid content to index")
572
-
573
- # # Save the unified vector store
574
- # vector_store.save(settings.vector_store_path, "all_docs")
575
-
576
- # return IndexResponse(
577
- # message=f"Successfully indexed all documents from folder",
578
- # chunks_indexed=total_chunks,
579
- # documents_processed=len(docs)
580
- # )
581
-
582
- # except HTTPException:
583
- # raise
584
- # except Exception as e:
585
- # raise HTTPException(status_code=500, detail=f"Error indexing documents: {str(e)}")
586
-
587
-
588
- # @app.post("/index-document", response_model=IndexResponse)
589
- # async def index_single_document(request: IndexRequest):
590
- # """
591
- # Index a single document (legacy support)
592
-
593
- # Note: It's better to use /index-all to index the entire folder
594
- # """
595
- # try:
596
- # if not request.document_id:
597
- # # If no doc ID provided, index all
598
- # return await index_all_documents()
599
-
600
- # document_id = request.document_id
601
-
602
- # # Read document
603
- # print(f"Reading document: {document_id}")
604
- # text = drive_service.get_document_content(document_id)
605
- # metadata = drive_service.get_document_metadata(document_id)
606
-
607
- # if not text or len(text.strip()) == 0:
608
- # raise HTTPException(status_code=400, detail="Document is empty")
609
-
610
- # # Chunk text
611
- # chunks = chunker.chunk_text(text)
612
-
613
- # if not chunks:
614
- # raise HTTPException(status_code=400, detail="No valid chunks created")
615
-
616
- # print(f"Created {len(chunks)} chunks")
617
-
618
- # # Generate embeddings
619
- # embeddings = embedding_engine.encode(chunks)
620
-
621
- # # Load existing vector store or create new
622
- # vector_store = VectorStore(dimension=embedding_engine.dimension)
623
- # vector_store.load(settings.vector_store_path, "all_docs")
624
-
625
- # # Add to vector store
626
- # doc_metadata = {
627
- # 'doc_id': metadata['id'],
628
- # 'doc_name': metadata['name'],
629
- # 'modified': metadata['modified']
630
- # }
631
- # vector_store.add_documents(chunks, embeddings, doc_metadata)
632
- # vector_store.save(settings.vector_store_path, "all_docs")
633
-
634
- # return IndexResponse(
635
- # message=f"Successfully indexed document: {metadata['name']}",
636
- # chunks_indexed=len(chunks),
637
- # documents_processed=1
638
- # )
639
-
640
- # except HTTPException:
641
- # raise
642
- # except Exception as e:
643
- # raise HTTPException(status_code=500, detail=f"Error indexing document: {str(e)}")
644
-
645
-
646
- # @app.post("/chat", response_model=ChatResponse)
647
- # async def chat(request: ChatRequest):
648
- # """
649
- # Chat endpoint - searches across ALL indexed documents
650
-
651
- # Features:
652
- # - Conversation history support (last 5 exchanges)
653
- # - Query clarity checking (only for first question)
654
- # - Automatic query rephrasing with context
655
- # - Context-aware responses
656
- # """
657
- # try:
658
- # question = request.question
659
- # conversation_history = [msg.dict() for msg in request.conversation_history]
660
-
661
- # # Step 1: Check if query needs clarification (ONLY if no conversation history)
662
- # is_clear, clarification = llm_service.check_query_clarity(question, conversation_history)
663
-
664
- # if not is_clear and clarification:
665
- # return ChatResponse(
666
- # answer=clarification,
667
- # sources=[],
668
- # is_clarification=True,
669
- # rephrased_query=None
670
- # )
671
-
672
- # # Step 2: Rephrase query if there's conversation history
673
- # rephrased_query = None
674
- # search_query = question
675
-
676
- # if conversation_history and len(conversation_history) > 0:
677
- # rephrased = llm_service.rephrase_query(question, conversation_history)
678
- # if rephrased and rephrased.lower() != question.lower():
679
- # rephrased_query = rephrased
680
- # search_query = rephrased
681
- # print(f"Original: {question}")
682
- # print(f"Rephrased: {rephrased}")
683
-
684
- # # Step 3: Load the unified vector store
685
- # vector_store = VectorStore(dimension=embedding_engine.dimension)
686
-
687
- # if not vector_store.load(settings.vector_store_path, "all_docs"):
688
- # raise HTTPException(
689
- # status_code=404,
690
- # detail="No documents indexed. Please use /index-all to index your folder first."
691
- # )
692
-
693
- # # Step 4: Generate query embedding (use rephrased query if available)
694
- # query_embedding = embedding_engine.encode_single(search_query)
695
-
696
- # # Step 5: Retrieve relevant chunks
697
- # results = vector_store.search(query_embedding, k=settings.top_k_results)
698
-
699
- # if not results:
700
- # return ChatResponse(
701
- # answer="I couldn't find any relevant information in the indexed documents to answer your question. Could you please rephrase or ask about something else?",
702
- # sources=[],
703
- # is_clarification=False,
704
- # rephrased_query=rephrased_query
705
- # )
706
-
707
- # # Step 6: Extract chunks and prepare sources
708
- # relevant_chunks = []
709
- # sources = []
710
-
711
- # for i, (chunk, distance, metadata) in enumerate(results):
712
- # relevant_chunks.append(chunk)
713
- # doc_name = metadata.get('doc_name', 'Unknown Document')
714
- # sources.append(f"📄 {doc_name}: {chunk[:100]}...")
715
-
716
- # # Step 7: Generate answer with conversation history
717
- # answer = llm_service.generate_answer(
718
- # relevant_chunks,
719
- # question, # Use original question for answer generation
720
- # conversation_history
721
- # )
722
-
723
- # return ChatResponse(
724
- # answer=answer,
725
- # sources=sources,
726
- # is_clarification=False,
727
- # rephrased_query=rephrased_query
728
- # )
729
-
730
- # except HTTPException:
731
- # raise
732
- # except Exception as e:
733
- # # Better error handling
734
- # error_msg = str(e)
735
-
736
- # # Check for rate limit errors
737
- # if "rate_limit" in error_msg.lower() or "429" in error_msg:
738
- # raise HTTPException(
739
- # status_code=429,
740
- # detail="Rate limit exceeded. Please wait a moment and try again."
741
- # )
742
-
743
- # # Check for API errors
744
- # if "api" in error_msg.lower() or "authentication" in error_msg.lower():
745
- # raise HTTPException(
746
- # status_code=503,
747
- # detail="LLM service temporarily unavailable. Please try again later."
748
- # )
749
-
750
- # raise HTTPException(status_code=500, detail=f"Error processing chat: {error_msg}")
751
-
752
-
753
- # @app.post("/reindex")
754
- # async def reindex_all():
755
- # """
756
- # Re-index all documents (useful when docs are updated)
757
-
758
- # Call this endpoint when:
759
- # - You've updated documents in the folder
760
- # - You've added new documents
761
- # - You want to refresh the index
762
- # """
763
- # try:
764
- # # Clear existing index
765
- # vector_store = VectorStore(dimension=embedding_engine.dimension)
766
- # vector_store.clear()
767
-
768
- # # Re-index everything
769
- # return await index_all_documents()
770
-
771
- # except Exception as e:
772
- # raise HTTPException(status_code=500, detail=f"Error re-indexing: {str(e)}")
773
-
774
-
775
- # @app.delete("/clear-index")
776
- # async def clear_index():
777
- # """Delete all indexed data"""
778
- # try:
779
- # index_path = os.path.join(settings.vector_store_path, "all_docs_index.faiss")
780
- # data_path = os.path.join(settings.vector_store_path, "all_docs_data.pkl")
781
-
782
- # deleted = False
783
- # if os.path.exists(index_path):
784
- # os.remove(index_path)
785
- # deleted = True
786
-
787
- # if os.path.exists(data_path):
788
- # os.remove(data_path)
789
- # deleted = True
790
-
791
- # if deleted:
792
- # return {"message": "Successfully cleared all indexed data"}
793
- # else:
794
- # raise HTTPException(status_code=404, detail="No index found")
795
-
796
- # except HTTPException:
797
- # raise
798
- # except Exception as e:
799
- # raise HTTPException(status_code=500, detail=f"Error clearing index: {str(e)}")
800
-
801
 
802
- # # Serve frontend
803
- # app.mount("/static", StaticFiles(directory="frontend"), name="static")
 
1
  from fastapi import FastAPI, HTTPException
2
  from fastapi.staticfiles import StaticFiles
 
3
  from fastapi.middleware.cors import CORSMiddleware
4
  from typing import List
5
  import os
 
22
@app.on_event("startup")
async def startup_event():
    """Display clickable link on startup.

    Prints a banner to stdout when the FastAPI app starts, including the
    local URL of the static frontend and a few quick-start tips.
    """
    # NOTE(review): reconstructed from a mangled diff rendering — exact
    # whitespace inside the printed strings could not be recovered verbatim.
    print("\n" + "=" * 70)
    print("🚀 Google Docs Knowledge Chatbot is running!")
    print("=" * 70)
    print("\n📱 Access the application here:")
    # \033[94m = blue, \033[4m = underline, \033[0m = reset (ANSI escapes
    # so the URL renders as a clickable, highlighted link in most terminals).
    print("\n 👉 \033[94m\033[4mhttp://localhost:8000/static/index.html\033[0m\n")
    print("=" * 70)
    print("\n💡 Quick Tips:")
    print(" • Click 'Index All Documents' to get started")
    print(" • Make sure your Google Drive folder is shared")
    print(" • Press CTRL+C to stop the server")
    print("\n" + "=" * 70 + "\n")
36
 
37
  # Add CORS middleware
 
47
# Load application settings (reconstructed from a mangled diff rendering).
settings = get_settings()

# Initialize services
# NOTE(review): this commit changes the GoogleDriveService argument from the
# parsed credentials dict (settings.get_google_credentials_dict()) to the
# raw settings.google_application_credentials value — confirm the service
# constructor accepts a credentials file path/string, not only a dict.
drive_service = GoogleDriveService(settings.google_application_credentials)
chunker = TextChunker(chunk_size=settings.chunk_size, chunk_overlap=settings.chunk_overlap)
embedding_engine = EmbeddingEngine()
llm_service = LLMService(settings.groq_api_key)

# Create data directory for the persisted vector store (no-op if it exists).
os.makedirs(settings.vector_store_path, exist_ok=True)
57
 
 
 
 
58
 
59
  @app.get("/")
60
  async def root():
 
 
 
 
 
 
61
  """Health check endpoint"""
62
  return {
63
  "status": "running",
 
105
  """
106
  try:
107
  # Get all documents in folder
108
+ try:
109
+ docs = drive_service.list_documents_in_folder(settings.google_drive_folder_id)
110
+ except Exception as e:
111
+ error_msg = str(e)
112
+
113
+ # Handle permission/access errors
114
+ if "403" in error_msg or "Permission denied" in error_msg:
115
+ raise HTTPException(
116
+ status_code=403,
117
+ detail={
118
+ "error": "Permission Denied",
119
+ "message": "Cannot access Google Drive folder. Please ensure:",
120
+ "steps": [
121
+ "1. The folder is shared with your service account email",
122
+ "2. Service account has at least 'Viewer' access",
123
+ "3. Check GOOGLE_DRIVE_FOLDER_ID in your .env file",
124
+ "4. Both Google Drive API and Google Docs API are enabled"
125
+ ],
126
+ "service_account_help": "Find your service account email in credentials.json under 'client_email'"
127
+ }
128
+ )
129
+
130
+ # Handle folder not found
131
+ elif "404" in error_msg or "not found" in error_msg.lower():
132
+ raise HTTPException(
133
+ status_code=404,
134
+ detail={
135
+ "error": "Folder Not Found",
136
+ "message": "The specified Google Drive folder does not exist.",
137
+ "steps": [
138
+ "1. Check your GOOGLE_DRIVE_FOLDER_ID in .env file",
139
+ "2. Verify the folder exists in Google Drive",
140
+ "3. Make sure you copied the correct folder ID from the URL"
141
+ ],
142
+ "example": "Folder URL: https://drive.google.com/drive/folders/YOUR_FOLDER_ID"
143
+ }
144
+ )
145
+
146
+ raise
147
 
148
  if not docs:
149
+ raise HTTPException(
150
+ status_code=404,
151
+ detail={
152
+ "error": "No Documents Found",
153
+ "message": "The folder exists but contains no Google Docs.",
154
+ "steps": [
155
+ "1. Add Google Docs to your shared folder",
156
+ "2. Make sure they are Google Docs (not PDFs or Word files)",
157
+ "3. Check that documents aren't in subfolders"
158
+ ]
159
+ }
160
+ )
161
 
162
  print(f"Found {len(docs)} documents in folder")
163
 
164
  # Initialize vector store
165
  vector_store = VectorStore(dimension=embedding_engine.dimension)
166
  total_chunks = 0
167
+ processed_docs = 0
168
+ failed_docs = []
169
 
170
  # Process each document
171
  for doc in docs:
 
173
  print(f"Processing: {doc['name']} ({doc['id']})")
174
 
175
  # Read document
176
+ try:
177
+ text = drive_service.get_document_content(doc['id'])
178
+ except Exception as e:
179
+ error_msg = str(e)
180
+
181
+ # Document is private/not shared
182
+ if "403" in error_msg or "Permission denied" in error_msg:
183
+ failed_docs.append({
184
+ "name": doc['name'],
185
+ "error": "Permission denied - document not shared with service account"
186
+ })
187
+ print(f" ⚠️ Skipping {doc['name']}: Permission denied")
188
+ continue
189
+
190
+ # Document deleted or invalid
191
+ elif "404" in error_msg:
192
+ failed_docs.append({
193
+ "name": doc['name'],
194
+ "error": "Document not found or deleted"
195
+ })
196
+ print(f" ⚠️ Skipping {doc['name']}: Not found")
197
+ continue
198
+
199
+ raise
200
 
201
+ # Handle empty documents
202
  if not text or len(text.strip()) == 0:
203
+ failed_docs.append({
204
+ "name": doc['name'],
205
+ "error": "Document is empty"
206
+ })
207
+ print(f" ⚠️ Skipping empty document: {doc['name']}")
208
+ continue
209
+
210
+ # Check minimum content length
211
+ if len(text.strip()) < 50:
212
+ failed_docs.append({
213
+ "name": doc['name'],
214
+ "error": f"Document too short ({len(text)} characters, minimum 50 required)"
215
+ })
216
+ print(f" ⚠️ Skipping {doc['name']}: Too short")
217
  continue
218
 
219
  # Chunk text
220
  chunks = chunker.chunk_text(text)
221
 
222
  if not chunks:
223
+ failed_docs.append({
224
+ "name": doc['name'],
225
+ "error": "Could not create valid chunks from document"
226
+ })
227
+ print(f" ⚠️ No chunks created for: {doc['name']}")
228
  continue
229
 
230
  print(f" Created {len(chunks)} chunks")
231
 
232
+ # Generate embeddings with retry logic
233
+ max_retries = 3
234
+ retry_delay = 2
235
+
236
+ for attempt in range(max_retries):
237
+ try:
238
+ embeddings = embedding_engine.encode(chunks)
239
+ break
240
+ except Exception as e:
241
+ if attempt < max_retries - 1:
242
+ print(f" Retry {attempt + 1}/{max_retries} for embeddings...")
243
+ import time
244
+ time.sleep(retry_delay)
245
+ else:
246
+ failed_docs.append({
247
+ "name": doc['name'],
248
+ "error": f"Failed to generate embeddings after {max_retries} attempts"
249
+ })
250
+ print(f" ❌ Failed to generate embeddings for: {doc['name']}")
251
+ continue
252
 
253
  # Add to vector store with metadata
254
  metadata = {
 
259
  vector_store.add_documents(chunks, embeddings, metadata)
260
 
261
  total_chunks += len(chunks)
262
+ processed_docs += 1
263
+ print(f" ✅ Added {len(chunks)} chunks to index")
264
 
265
  except Exception as e:
266
+ failed_docs.append({
267
+ "name": doc['name'],
268
+ "error": str(e)
269
+ })
270
+ print(f" ❌ Error processing {doc['name']}: {str(e)}")
271
  continue
272
 
273
  if total_chunks == 0:
274
+ error_detail = {
275
+ "error": "No Content Indexed",
276
+ "message": "All documents failed to index.",
277
+ "failed_documents": failed_docs,
278
+ "steps": [
279
+ "1. Check that documents have actual content",
280
+ "2. Ensure documents are shared with service account",
281
+ "3. Verify documents are Google Docs (not PDFs/Word)"
282
+ ]
283
+ }
284
+ raise HTTPException(status_code=400, detail=error_detail)
285
 
286
  # Save the unified vector store
287
  vector_store.save(settings.vector_store_path, "all_docs")
288
 
289
+ response_detail = {
290
+ "message": f"Successfully indexed documents from folder",
291
+ "chunks_indexed": total_chunks,
292
+ "documents_processed": processed_docs,
293
+ "total_documents": len(docs)
294
+ }
295
+
296
+ # Add warning if some docs failed
297
+ if failed_docs:
298
+ response_detail["warnings"] = {
299
+ "failed_documents": failed_docs,
300
+ "message": f"{len(failed_docs)} document(s) failed to index"
301
+ }
302
+
303
+ return IndexResponse(**response_detail)
304
 
305
  except HTTPException:
306
  raise
307
  except Exception as e:
308
+ raise HTTPException(
309
+ status_code=500,
310
+ detail={
311
+ "error": "Internal Server Error",
312
+ "message": str(e),
313
+ "steps": [
314
+ "1. Check server logs for details",
315
+ "2. Verify all environment variables are set",
316
+ "3. Ensure credentials.json is valid"
317
+ ]
318
+ }
319
+ )
320
 
321
 
322
  @app.post("/index-document", response_model=IndexResponse)
 
464
  except HTTPException:
465
  raise
466
  except Exception as e:
467
+ # Better error handling with rate limit detection
468
  error_msg = str(e)
469
 
470
+ # Check for rate limit errors (GROQ API)
471
+ if "rate_limit" in error_msg.lower() or "429" in error_msg or "too many requests" in error_msg.lower():
472
  raise HTTPException(
473
  status_code=429,
474
+ detail={
475
+ "error": "Rate Limit Exceeded",
476
+ "message": "Too many requests to the AI service. Please wait a moment.",
477
+ "retry_after": "30 seconds",
478
+ "steps": [
479
+ "1. Wait 30 seconds before trying again",
480
+ "2. Reduce the frequency of your requests",
481
+ "3. Consider upgrading your GROQ API plan for higher limits"
482
+ ]
483
+ }
484
  )
485
 
486
+ # Check for API authentication errors
487
+ if "api" in error_msg.lower() or "authentication" in error_msg.lower() or "401" in error_msg:
488
  raise HTTPException(
489
  status_code=503,
490
+ detail={
491
+ "error": "AI Service Unavailable",
492
+ "message": "Cannot connect to AI service. Please check your API key.",
493
+ "steps": [
494
+ "1. Verify GROQ_API_KEY in your .env file",
495
+ "2. Ensure the API key is valid and active",
496
+ "3. Check if your GROQ account has credits",
497
+ "4. Try regenerating your API key at console.groq.com"
498
+ ]
499
+ }
500
  )
501
 
502
+ # Check for embedding/model errors
503
+ if "model" in error_msg.lower() or "embedding" in error_msg.lower():
504
+ raise HTTPException(
505
+ status_code=503,
506
+ detail={
507
+ "error": "Model Service Error",
508
+ "message": "Error generating embeddings or processing text.",
509
+ "steps": [
510
+ "1. The embedding service may be temporarily down",
511
+ "2. Try again in a few moments",
512
+ "3. Check your internet connection"
513
+ ]
514
+ }
515
+ )
516
+
517
+ raise HTTPException(
518
+ status_code=500,
519
+ detail={
520
+ "error": "Chat Processing Error",
521
+ "message": error_msg,
522
+ "steps": [
523
+ "1. Try asking your question differently",
524
+ "2. If problem persists, check server logs",
525
+ "3. Verify all services are running properly"
526
+ ]
527
+ }
528
+ )
529
 
530
 
531
  @app.post("/reindex")
 
576
  except Exception as e:
577
  raise HTTPException(status_code=500, detail=f"Error clearing index: {str(e)}")
578
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
579
 
580
# Serve frontend
# Mounted last so API routes declared above take precedence; files in the
# local "frontend" directory are served under /static (e.g. /static/index.html).
app.mount("/static", StaticFiles(directory="frontend"), name="static")