File size: 6,512 Bytes
59eb043
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
"""Download datasets and embeddings from Google Drive on startup"""
import os
import shutil
import subprocess
import sys
from pathlib import Path

# Google Drive folder ID (extracted from the share link).
# setup_data() passes this to download_folder_from_gdrive() when data is missing.
GDRIVE_FOLDER_ID = "1tvoY4Ks3elgRgC81uRsZRDhDcclmu5hO"

def install_gdown():
    """Return the ``gdown`` module, installing it with pip first if needed.

    Returns:
        The imported ``gdown`` module object.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command fails.
        ImportError: If ``gdown`` still cannot be imported after installing.
    """
    try:
        import gdown
    except ImportError:
        import importlib

        print("Installing gdown...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "gdown", "-q"])
        # The package landed on disk after the interpreter started, so the
        # import finders' cached directory listings may not include it yet.
        importlib.invalidate_caches()
        import gdown
    return gdown

def download_folder_from_gdrive(folder_id: str, output_dir: str):
    """Download an entire Google Drive folder into ``output_dir``.

    Args:
        folder_id: ID of the Google Drive folder; it is interpolated into a
            ``https://drive.google.com/drive/folders/<id>`` URL.
        output_dir: Local directory to download into. It is created
            (including parents) if it does not exist yet.

    Returns:
        True if gdown reported a successful download, False otherwise.
        Exceptions raised by the download itself are printed with a
        traceback and turned into a False return; failures while obtaining
        gdown or creating ``output_dir`` are not caught here.
    """
    gdown = install_gdown()

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    print(f"Downloading from GDrive folder {folder_id} to {output_dir}...")

    url = f"https://drive.google.com/drive/folders/{folder_id}"
    try:
        downloaded = gdown.download_folder(
            url, output=output_dir, quiet=False, remaining_ok=True
        )
    except Exception as e:
        print(f"Error downloading: {e}")
        import traceback
        traceback.print_exc()
        return False

    # gdown.download_folder signals some failures (folder listing could not be
    # retrieved, a file download ended unsuccessfully) by returning None
    # instead of raising, so the return value has to be checked as well.
    if downloaded is None:
        print("Error downloading: gdown reported an unsuccessful folder download")
        return False

    print(f"Download complete: {output_dir}")
    return True

def organize_downloaded_files(download_dir: Path, backend_dir: Path):
    """Move downloaded files from ``download_dir`` to their place in ``backend_dir``.

    Handling per top-level item of ``download_dir``:

    * anything with a ``.csv`` suffix (case-insensitive) is moved into
      ``backend_dir / "dataset"``;
    * a directory named ``chroma_db`` or ``manga_chroma_db`` replaces the
      directory of the same name under ``backend_dir``;
    * any other directory is processed recursively with the same rules;
    * everything else is left where it is.

    A failure on one item is printed (with traceback) and does not stop the
    remaining items from being processed.

    Args:
        download_dir: Directory containing the downloaded items.
        backend_dir: Root directory the items are organized into.

    Returns:
        True if ``download_dir`` exists and every item was handled without
        error (including nested directories); False otherwise.
    """
    print(f"\n{'='*50}")
    print("ORGANIZING DOWNLOADED FILES")
    print(f"{'='*50}")
    print(f"Source: {download_dir}")
    print(f"Destination: {backend_dir}")

    if not download_dir.exists():
        print(f"ERROR: Download directory {download_dir} does not exist!")
        return False

    # Directory names that are moved wholesale, replacing any existing copy.
    db_folder_names = ("chroma_db", "manga_chroma_db")

    # Snapshot the listing first: items are moved out of the directory below.
    items = list(download_dir.iterdir())
    print(f"Found {len(items)} items in download folder:")

    all_ok = True
    for item in items:
        print(f"\n  Processing: {item.name} (is_dir={item.is_dir()})")

        try:
            # Move CSV files to dataset folder
            if item.suffix.lower() == '.csv':
                dataset_dir = backend_dir / "dataset"
                dataset_dir.mkdir(parents=True, exist_ok=True)
                dest = dataset_dir / item.name
                print(f"    Moving CSV to {dest}")
                shutil.move(str(item), str(dest))
                print(f"    ✓ Moved successfully")

            # Move the vector-store folders, replacing any previous copy
            elif item.is_dir() and item.name in db_folder_names:
                dest = backend_dir / item.name
                if dest.exists():
                    print(f"    Removing existing {dest}")
                    shutil.rmtree(dest)
                print(f"    Moving {item.name} to {dest}")
                shutil.move(str(item), str(dest))
                print(f"    ✓ Moved successfully")

            # Handle nested directories (GDrive sometimes creates nested folders)
            elif item.is_dir():
                print(f"    Recursively processing subdirectory: {item.name}")
                if not organize_downloaded_files(item, backend_dir):
                    all_ok = False
        except Exception as e:
            # Best effort: report the failure, remember it, keep going.
            all_ok = False
            print(f"    ERROR moving {item.name}: {e}")
            import traceback
            traceback.print_exc()

    return all_ok

def verify_data(backend_dir: Path):
    """Report which data files are present under ``backend_dir``.

    Prints the existence and path of ``dataset/anime.csv``, ``chroma_db``
    and ``manga_chroma_db``, followed by the directory listing of each
    vector-store folder that exists.

    Args:
        backend_dir: Root directory that should contain the data.

    Returns:
        True if both ``dataset/anime.csv`` and ``chroma_db`` exist. The
        manga store is reported but does not affect the result.
    """
    rule = "=" * 50
    print(f"\n{rule}")
    print("VERIFYING DATA FILES")
    print(rule)

    anime_csv = backend_dir / "dataset" / "anime.csv"
    anime_store = backend_dir / "chroma_db"
    manga_store = backend_dir / "manga_chroma_db"

    vector_stores = (
        ("ChromaDB", anime_store),
        ("Manga ChromaDB", manga_store),
    )

    # Presence summary, one line per tracked path.
    for label, location in (("Dataset (anime.csv)", anime_csv), *vector_stores):
        print(f"{label}: {location.exists()} - {location}")

    # Directory listings for whichever stores are there.
    for label, location in vector_stores:
        if location.exists():
            print(f"  {label} contents: {list(location.iterdir())}")

    return anime_csv.exists() and anime_store.exists()

def setup_data():
    """Make sure the dataset and ChromaDB files exist, downloading if needed.

    The data is looked up next to this file. If ``dataset/anime.csv`` and a
    non-empty ``chroma_db`` directory are already there, nothing is
    downloaded. Otherwise the Google Drive folder ``GDRIVE_FOLDER_ID`` is
    downloaded into a temporary ``data_download`` directory, its contents
    are moved into place, the temporary directory is removed, and the
    result is verified.

    Returns:
        True if the required data is present when the function finishes;
        False if the download failed or the required files are still
        missing after it.
    """
    backend_dir = Path(__file__).parent

    print(f"\n{'='*50}")
    print("ANIVERSE DATA SETUP")
    print(f"{'='*50}")
    print(f"Backend directory: {backend_dir}")

    # Check if data already exists
    dataset_dir = backend_dir / "dataset"
    chroma_dir = backend_dir / "chroma_db"

    dataset_exists = (dataset_dir / "anime.csv").exists()
    # An empty chroma_db directory counts as missing; is_dir() also keeps
    # iterdir() from raising if the path happens to be a regular file.
    chroma_exists = chroma_dir.is_dir() and any(chroma_dir.iterdir())

    print(f"Dataset exists: {dataset_exists}")
    print(f"ChromaDB exists: {chroma_exists}")

    if dataset_exists and chroma_exists:
        print("All data files present, skipping download.")
        verify_data(backend_dir)
        return True

    # Download from GDrive
    print(f"\n{'='*50}")
    print("DOWNLOADING DATA FROM GOOGLE DRIVE")
    print(f"{'='*50}")

    download_dir = backend_dir / "data_download"
    success = download_folder_from_gdrive(GDRIVE_FOLDER_ID, str(download_dir))

    if not success:
        print("\n" + "!"*50)
        print("WARNING: Failed to download data from GDrive!")
        print("The server may not function correctly without data files.")
        print("!"*50)
        return False

    organize_downloaded_files(download_dir, backend_dir)

    # Cleanup download folder
    if download_dir.exists():
        print(f"\nCleaning up download folder: {download_dir}")
        try:
            shutil.rmtree(download_dir)
            print("✓ Cleanup complete")
        except Exception as e:
            print(f"Warning: Could not cleanup: {e}")

    # A successful download does not guarantee the expected files were in the
    # folder or could be moved, so the verification result is what is returned.
    verified = verify_data(backend_dir)
    if not verified:
        print("\n" + "!"*50)
        print("WARNING: Required data files are still missing after download!")
        print("The server may not function correctly without data files.")
        print("!"*50)

    return verified

# Script entry point: run the data setup when this file is executed directly.
# The return value of setup_data() is not used here.
if __name__ == "__main__":
    setup_data()