File size: 23,381 Bytes
a4b70d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
from __future__ import annotations

import re
import os
import json
from pathlib import Path
from typing import Iterator, Optional, AsyncIterator
from aiohttp import ClientSession, ClientError, ClientResponse, ClientTimeout
import urllib.parse
import time
import zipfile
import asyncio
import hashlib
import base64
import tempfile
import shutil

try:
    import PyPDF2
    from PyPDF2.errors import PdfReadError
    has_pypdf2 = True
except ImportError:
    has_pypdf2 = False
try:
    import pdfplumber
    has_pdfplumber = True
except ImportError:
    has_pdfplumber = False
try:
    from pdfminer.high_level import extract_text
    has_pdfminer = True
except ImportError:
    has_pdfminer = False
try:
    from docx import Document
    has_docx = True
except ImportError:
    has_docx = False
try:
    import docx2txt
    has_docx2txt = True
except ImportError:
    has_docx2txt = False
try:
    from odf.opendocument import load
    from odf.text import P
    has_odfpy = True
except ImportError:
    has_odfpy = False
try:
    import ebooklib
    from ebooklib import epub
    has_ebooklib = True
except ImportError:
    has_ebooklib = False
try:
    import pandas as pd
    has_openpyxl = True
except ImportError:
    has_openpyxl = False
try:
    import spacy
    has_spacy = True
except:
    has_spacy = False
try:
    from bs4 import BeautifulSoup
    has_beautifulsoup4 = True
except ImportError:
    has_beautifulsoup4 = False
try:
    from markitdown import MarkItDown
    has_markitdown = True
except ImportError:
    has_markitdown = False

from .web_search import scrape_text
from ..files import secure_filename, get_bucket_dir
from ..image import is_allowed_extension
from ..requests.aiohttp import get_connector
from ..providers.asyncio import to_sync_generator
from ..errors import MissingRequirementsError
from .. import debug

# File extensions (without the leading dot) whose content is read as plain text.
PLAIN_FILE_EXTENSIONS = ["txt", "xml", "json", "js", "har", "sh", "py", "php", "css", "yaml", "sql", "log", "csv", "twig", "md", "arc"]
# Name of the file inside a bucket directory that caches the streamed plain text.
PLAIN_CACHE = "plain.cache"
# Name of the JSON file inside a bucket directory that lists the URLs to download.
DOWNLOADS_FILE = "downloads.json"
# Name of the file inside a bucket directory that lists one filename per line.
FILE_LIST = "files.txt"

def supports_filename(filename: str):
    """Tell whether the text of a file with this name can be extracted.

    Args:
        filename: File name (or path) whose ending selects the reader.

    Returns:
        True when a reader is available for the file type, False otherwise.

    Raises:
        MissingRequirementsError: For ``.pdf``, ``.docx`` and ``.html`` names
            when none of the optional libraries for that type is installed.
    """
    if filename.endswith(".pdf"):
        # Any one of the three optional PDF libraries is sufficient.
        if has_pypdf2 or has_pdfplumber or has_pdfminer:
            return True
        raise MissingRequirementsError('Install "pypdf2" requirements | pip install -U g4f[files]')
    if filename.endswith(".docx"):
        if has_docx or has_docx2txt:
            return True
        raise MissingRequirementsError('Install "docx" requirements | pip install -U g4f[files]')
    # These formats are only supported when their optional library is present;
    # otherwise the name falls through to the plain-extension check below.
    if has_odfpy and filename.endswith(".odt"):
        return True
    if has_ebooklib and filename.endswith(".epub"):
        return True
    if has_openpyxl and filename.endswith(".xlsx"):
        return True
    if filename.endswith(".html"):
        if has_beautifulsoup4:
            return True
        raise MissingRequirementsError('Install "beautifulsoup4" requirements | pip install -U g4f[files]')
    if filename.endswith(".zip"):
        return True
    # Lock files are excluded even though "json" is a plain-text extension.
    if filename.endswith("package-lock.json") and filename != FILE_LIST:
        return False
    suffix = os.path.splitext(filename)[1][1:]
    return suffix in PLAIN_FILE_EXTENSIONS

def spacy_refine_chunks(source_iterator):
    """Condense each page of text to its two longest sentences using spaCy.

    Args:
        source_iterator: Iterable of text pages.

    Yields:
        The text of up to two sentences per page, longest first.

    Raises:
        MissingRequirementsError: If spaCy could not be imported.
    """
    if not has_spacy:
        raise MissingRequirementsError('Install "spacy" requirements | pip install -U g4f[files]')

    pipeline = spacy.load("en_core_web_sm")
    for page in source_iterator:
        # Sentence segmentation comes from the loaded pipeline; rank by length.
        ranked = sorted(pipeline(page).sents, key=lambda sentence: len(sentence.text), reverse=True)
        yield from (sentence.text for sentence in ranked[:2])

def get_filenames(bucket_dir: Path):
    """Return the filenames recorded in the bucket's file list.

    Args:
        bucket_dir: Bucket directory that may contain the file list.

    Returns:
        The stripped lines of the file list, or an empty list if it is absent.
    """
    listing = bucket_dir / FILE_LIST
    if not listing.exists():
        return []
    with listing.open('r') as handle:
        return [line.strip() for line in handle]

def stream_read_files(bucket_dir: Path, filenames: list[str], delete_files: bool = False) -> Iterator[str]:
    """Yield the extracted text of the given bucket files, framed by file markers.

    Each readable file produces a ``<!-- File: name -->`` header, its text
    chunks, and a ``<-- End -->`` footer. Zip archives are extracted into the
    bucket directory and their supported members are streamed recursively.

    Args:
        bucket_dir: Directory that contains the files.
        filenames: Names relative to ``bucket_dir``.
        delete_files: When true, members extracted from zip archives are
            removed again after they have been streamed.

    Yields:
        Marker strings and text chunks.
    """
    for filename in filenames:
        if filename.startswith(DOWNLOADS_FILE):
            continue
        file_path: Path = bucket_dir / filename
        # Skip missing and empty files.
        if not file_path.exists() or file_path.lstat().st_size <= 0:
            continue
        extension = os.path.splitext(filename)[1][1:]
        if filename.endswith(".zip"):
            # The constructor and extractall can raise BadZipFile as well, so the
            # handler has to wrap the whole archive handling, not just the read.
            try:
                with zipfile.ZipFile(file_path, 'r') as zip_ref:
                    members = zip_ref.namelist()
                    try:
                        zip_ref.extractall(bucket_dir)
                        yield from stream_read_files(bucket_dir, [f for f in members if supports_filename(f)], delete_files)
                    finally:
                        if delete_files:
                            # Reverse order so files are removed before their directories.
                            for unlink in members[::-1]:
                                filepath = os.path.join(bucket_dir, unlink)
                                if os.path.exists(filepath):
                                    if os.path.isdir(filepath):
                                        os.rmdir(filepath)
                                    else:
                                        os.unlink(filepath)
            except zipfile.BadZipFile as e:
                debug.log(f"Skipping invalid zip archive {filename}: {e}")
            continue
        yield f"<!-- File: {filename} -->\n"
        if filename.endswith(".pdf"):
            # Use exactly one PDF backend; independent checks would emit the
            # same document once per installed library.
            if has_pypdf2:
                try:
                    reader = PyPDF2.PdfReader(file_path)
                    for page in reader.pages:
                        # Guard against a backend returning None for an empty page.
                        yield page.extract_text() or ""
                except PdfReadError:
                    # Unreadable PDF: move on to the next file (no end marker is emitted).
                    continue
            elif has_pdfplumber:
                with pdfplumber.open(file_path) as pdf:
                    for page in pdf.pages:
                        yield page.extract_text() or ""
            elif has_pdfminer:
                yield extract_text(file_path)
        elif has_docx and filename.endswith(".docx"):
            doc = Document(file_path)
            for para in doc.paragraphs:
                yield para.text
        elif has_docx2txt and filename.endswith(".docx"):
            yield docx2txt.process(file_path)
        elif has_odfpy and filename.endswith(".odt"):
            textdoc = load(file_path)
            allparas = textdoc.getElementsByType(P)
            for p in allparas:
                yield p.firstChild.data if p.firstChild else ""
        elif has_ebooklib and filename.endswith(".epub"):
            book = epub.read_epub(file_path)
            for doc_item in book.get_items():
                if doc_item.get_type() == ebooklib.ITEM_DOCUMENT:
                    yield doc_item.get_content().decode(errors='ignore')
        elif has_openpyxl and filename.endswith(".xlsx"):
            df = pd.read_excel(file_path)
            for row in df.itertuples(index=False):
                yield " ".join(str(cell) for cell in row)
        elif has_beautifulsoup4 and filename.endswith(".html"):
            yield from scrape_text(file_path.read_text(errors="ignore"))
        elif extension in PLAIN_FILE_EXTENSIONS:
            yield file_path.read_text(errors="ignore").strip()
        yield f"\n<-- End -->\n\n"

def cache_stream(stream: Iterator[str], bucket_dir: Path) -> Iterator[str]:
    """Pass a text stream through while persisting it as the bucket's plain cache.

    If the cache file already exists it is replayed in chunks and ``stream``
    is not consumed. Otherwise every chunk is written to a temporary file that
    is renamed to the cache file only after the stream was fully consumed.

    Args:
        stream: Source of text chunks.
        bucket_dir: Bucket directory that holds the cache file.

    Yields:
        The cached chunks, or the chunks of ``stream`` unchanged.
    """
    cache_file = bucket_dir / PLAIN_CACHE
    if cache_file.exists():
        yield from read_path_chunked(cache_file)
        return
    tmp_file = bucket_dir / f"{PLAIN_CACHE}.{time.time()}.tmp"
    completed = False
    try:
        with open(tmp_file, "wb") as f:
            for chunk in stream:
                f.write(chunk.encode(errors="replace"))
                yield chunk
        tmp_file.rename(cache_file)
        completed = True
    finally:
        # An error or an early-closed consumer leaves a partial temp file behind;
        # remove it so incomplete data never accumulates in the bucket.
        if not completed and tmp_file.exists():
            tmp_file.unlink()

def is_complete(data: str):
    """Return True if ``data`` ends with a closing code fence and all fences are paired.

    The text must end with a fence line followed by a blank line, and the
    number of triple-backtick markers in it must be even.
    """
    if not data.endswith("\n```\n\n"):
        return False
    fence_count = data.count("```")
    return fence_count % 2 == 0

def read_path_chunked(path: Path):
    """Read a UTF-8 text file and yield it in line-aligned chunks.

    Lines are accumulated until at least 4096 bytes are buffered. The buffer
    is then emitted as soon as it passes ``is_complete`` or has grown to
    8192 bytes or more. Whatever remains at end of file is emitted last.

    Args:
        path: File to read.

    Yields:
        Consecutive pieces of the file's text.
    """
    with path.open("r", encoding='utf-8') as handle:
        pending = ""
        pending_bytes = 0
        for line in handle:
            pending_bytes += len(line.encode('utf-8'))
            pending += line
            if pending_bytes < 4096:
                continue
            if is_complete(pending) or pending_bytes >= 8192:
                yield pending
                pending, pending_bytes = "", 0
        if pending_bytes > 0:
            yield pending

def read_bucket(bucket_dir: Path):
    """Yield the cached text of a bucket.

    The whole plain cache is yielded first when it exists and no first
    refined part exists. After that the numbered parts are yielded in order,
    preferring the ``spacy_NNNN.cache`` file over ``plain_NNNN.cache`` for
    each index, and stopping at the first index that has neither.

    Args:
        bucket_dir: Bucket directory (anything accepted by ``Path``).

    Yields:
        The text content of each cache file.
    """
    bucket_dir = Path(bucket_dir)
    plain_cache = bucket_dir / PLAIN_CACHE
    first_refined = bucket_dir / "spacy_0001.cache"
    if plain_cache.is_file() and not first_refined.is_file():
        yield plain_cache.read_text(errors="replace")
    for idx in range(1, 1000):
        candidates = (
            bucket_dir / f"spacy_{idx:04d}.cache",
            bucket_dir / f"plain_{idx:04d}.cache",
        )
        chosen = next((candidate for candidate in candidates if candidate.is_file()), None)
        if chosen is None:
            break
        yield chosen.read_text(errors="replace")

def stream_read_parts_and_refine(bucket_dir: Path, delete_files: bool = False) -> Iterator[str]:
    """Stream the spaCy-refined text of a bucket, refining missing parts on demand.

    When neither a first refined part nor a first plain part exists, the
    plain cache is split into numbered ``plain_NNNN.cache`` parts first. For
    each index an existing ``spacy_NNNN.cache`` is replayed; otherwise the
    plain part is refined, streamed, and stored under that name.

    Args:
        bucket_dir: Bucket directory (anything accepted by ``Path``).
        delete_files: When true, each plain part is deleted after refinement.

    Yields:
        Refined text chunks.
    """
    # Normalise once so every path below is built the same way.
    bucket_dir = Path(bucket_dir)
    plain_cache = bucket_dir / PLAIN_CACHE
    first_refined = bucket_dir / "spacy_0001.cache"
    first_part = bucket_dir / "plain_0001.cache"
    if not first_refined.exists() and not first_part.exists() and plain_cache.exists():
        split_file_by_size_and_newline(plain_cache, bucket_dir)
    for idx in range(1, 1000):
        part = bucket_dir / f"plain_{idx:04d}.cache"
        refined_file = bucket_dir / f"spacy_{idx:04d}.cache"
        if refined_file.exists():
            # ``errors`` belongs to opening the file, not to ``read()``.
            yield refined_file.read_text(errors="replace")
            continue
        if not part.exists():
            break
        # Write to a temp name first so a partial result is never mistaken for a finished one.
        tmp_file = bucket_dir / f"spacy_{idx:04d}.{time.time()}.tmp"
        with tmp_file.open("w") as f:
            for chunk in spacy_refine_chunks(read_path_chunked(part)):
                f.write(chunk)
                yield chunk
        tmp_file.rename(refined_file)
        if delete_files:
            part.unlink()

def split_file_by_size_and_newline(input_filename, output_dir, chunk_size_bytes=1024*1024): # 1MB
    """Split a UTF-8 text file into numbered parts, cutting only at line ends.

    Parts are named ``<stem>_<NNNN><ext>`` after the input file and written
    to ``output_dir``. A part is closed once it holds at least
    ``chunk_size_bytes`` and either passes ``is_complete`` or has reached
    twice that size.

    Args:
        input_filename: Path to the input file.
        output_dir: Directory that receives the part files.
        chunk_size_bytes: Desired size of each part in bytes.
    """
    stem, suffix = os.path.splitext(os.path.basename(input_filename))
    prefix = os.path.join(output_dir, stem + "_")

    def write_part(index, text):
        # Persist one finished part under its zero-padded sequence number.
        with open(f"{prefix}{index:04d}{suffix}", 'w', encoding='utf-8') as outfile:
            outfile.write(text)

    with open(input_filename, 'r', encoding='utf-8') as infile:
        part_index = 1
        pending = ""
        pending_bytes = 0

        for line in infile:
            pending += line
            pending_bytes += len(line.encode('utf-8'))
            if pending_bytes < chunk_size_bytes:
                continue
            if is_complete(pending) or pending_bytes >= chunk_size_bytes * 2:
                write_part(part_index, pending)
                pending, pending_bytes = "", 0
                part_index += 1

        # Flush whatever is left after the last full part.
        if pending:
            write_part(part_index, pending)

def get_filename_from_url(url: str, extension: str = ".md") -> str:
    """Build a filename for a URL from its host, its path and a hash of the URL.

    Args:
        url: The URL to name.
        extension: Suffix appended to the result.

    Returns:
        ``<netloc>+<path with "/" replaced by "_">+<hash><extension>``, where
        the hash is the first 24 lowercase base32 characters of the URL's SHA-256.
    """
    parts = urllib.parse.urlparse(url)
    flat_path = parts.path[1:].replace('/', '_')
    digest = hashlib.sha256(url.encode()).digest()
    url_hash = base64.b32encode(digest).decode()[:24].lower()
    return f"{parts.netloc}+{flat_path}+{url_hash}{extension}"

async def get_filename(response: ClientResponse) -> Optional[str]:
    """
    Attempts to extract a filename from an aiohttp response. Prioritizes Content-Disposition, then URL.

    Args:
        response: The aiohttp ClientResponse object.

    Returns:
        The filename as a string, or None if it cannot be determined.
    """

    content_disposition = response.headers.get('Content-Disposition')
    if content_disposition and 'filename=' in content_disposition:
        value = content_disposition.split('filename=', 1)[1].strip()
        if value.startswith('"'):
            # Quoted value: take everything up to the closing quote.
            filename = value[1:].split('"', 1)[0]
        else:
            # Bare token: stop at the next header parameter.
            filename = value.split(';', 1)[0].strip()
        if filename:
            return secure_filename(filename)

    # Fall back to a name derived from the URL plus a detected extension.
    content_type = response.headers.get('Content-Type')
    url = str(response.url)
    if content_type and url:
        extension = await get_file_extension(response)
        if extension:
            return get_filename_from_url(url, extension)

    return None

async def get_file_extension(response: ClientResponse):
    """
    Attempts to determine the file extension from an aiohttp response.

    The Content-Type header is checked first; if it matches none of the known
    types, the suffix of the URL's path component is used.

    Args:
        response: The aiohttp ClientResponse object.

    Returns:
        The file extension (e.g., ".html", ".json", ".pdf", ".zip", ".md", ".txt") as a string.
        The URL fallback returns an empty string when the path has no suffix;
        None is returned only when the URL itself is empty.
    """

    content_type = response.headers.get('Content-Type')
    if content_type:
        content_type = content_type.lower()
        # Checked in order; the first marker contained in the header wins.
        known_types = (
            ("html", ".html"),
            ("json", ".json"),
            ("pdf", ".pdf"),
            ("zip", ".zip"),
            ("text/plain", ".txt"),
            ("markdown", ".md"),
        )
        for marker, extension in known_types:
            if marker in content_type:
                return extension

    url = str(response.url)
    if url:
        # Only the path may contribute the suffix; query strings and fragments
        # would otherwise end up inside the extension.
        return Path(urllib.parse.urlparse(url).path).suffix.lower()

    return None

def read_links(html: str, base: str) -> set[str]:
    """Collect the followable links of an HTML page as absolute URLs.

    The search is narrowed to the first matching main-content container, if
    any. Links marked ``rel="nofollow"`` are skipped, only ``https://`` and
    root-relative targets are kept, and fragments are removed.

    Args:
        html: HTML source of the page.
        base: URL of the page, used to resolve relative links.

    Returns:
        The set of absolute link URLs.
    """
    soup = BeautifulSoup(html, "html.parser")
    for selector in [
            "main",
            ".main-content-wrapper",
            ".main-content",
            ".emt-container-inner",
            ".content-wrapper",
            "#content",
            "#mainContent",
        ]:
        select = soup.select_one(selector)
        if select:
            soup = select
            break
    urls = []
    for link in soup.select("a"):
        if "nofollow" in link.attrs.get("rel", []):
            continue
        url = link.attrs.get("href")
        # Anchors without href yield None; test that before any string method.
        if url and url.startswith(("https://", "/")):
            urls.append(url.split("#")[0])
    return {urllib.parse.urljoin(base, link) for link in urls}

async def download_urls(
    bucket_dir: Path,
    urls: list[str],
    max_depth: int = 0,
    loading_urls: set[str] = None,
    lock: asyncio.Lock = None,
    delay: int = 3,
    new_urls: list[str] = None,
    group_size: int = 5,
    timeout: int = 10,
    proxy: Optional[str] = None
) -> AsyncIterator[str]:
    """Download URLs into the bucket directory, optionally following HTML links.

    Each URL is first converted with MarkItDown when that package is
    available; otherwise (or when the conversion fails or is empty) it is
    fetched with aiohttp. Links found in HTML pages are followed while
    ``max_depth`` is greater than zero, in groups of ``group_size``.

    Args:
        bucket_dir: Target directory.
        urls: URLs to download.
        max_depth: How many levels of links to follow.
        loading_urls: URLs already scheduled, shared with recursive calls; a
            fresh set is created when omitted.
        lock: Lock guarding ``loading_urls``/``new_urls``; created when omitted.
        delay: Seconds to sleep after a failed download and between groups.
        new_urls: List that collects discovered links; a fresh list is created
            when omitted.
        group_size: Number of discovered URLs downloaded per recursive call.
        timeout: Timeout value passed to ``ClientTimeout``.
        proxy: Optional proxy passed to ``get_connector``.

    Yields:
        The filename of every successfully stored download.
    """
    # Fresh containers per call: mutable defaults would leak URLs between calls.
    if loading_urls is None:
        loading_urls = set()
    if new_urls is None:
        new_urls = []
    if lock is None:
        lock = asyncio.Lock()
    # MarkItDown is only defined when its import succeeded.
    md = MarkItDown() if has_markitdown else None
    async with ClientSession(
        connector=get_connector(proxy=proxy),
        timeout=ClientTimeout(timeout)
    ) as session:
        async def download_url(url: str, max_depth: int) -> Optional[str]:
            # Store one URL; return its filename, or None if nothing was stored.
            if md is not None:
                try:
                    text_content = md.convert(url).text_content
                    if text_content:
                        filename = get_filename_from_url(url)
                        target = bucket_dir / filename
                        text_content = f"{text_content.strip()}\n\nSource: {url}\n"
                        target.write_text(text_content, errors="replace")
                        return filename
                except Exception as e:
                    debug.log(f"Failed to convert URL to text: {type(e).__name__}: {e}")
            try:
                async with session.get(url) as response:
                    response.raise_for_status()
                    filename = await get_filename(response)
                    if not filename:
                        debug.log(f"Failed to get filename for {url}")
                        return None
                    if (not is_allowed_extension(filename) and not supports_filename(filename)) or filename == DOWNLOADS_FILE:
                        return None
                    is_html = filename.endswith(".html")
                    html_body = None
                    if is_html and max_depth > 0:
                        # Reading the body for link extraction drains the stream, so
                        # keep the bytes to write them to disk afterwards.
                        html_body = await response.read()
                        add_urls = read_links(await response.text(), str(response.url))
                        if add_urls:
                            async with lock:
                                add_urls = [add_url for add_url in add_urls if add_url not in loading_urls]
                                loading_urls.update(add_urls)
                                for add_url in add_urls:
                                    if add_url not in new_urls:
                                        new_urls.append(add_url)
                    if is_allowed_extension(filename):
                        target = bucket_dir / "media" / filename
                        target.parent.mkdir(parents=True, exist_ok=True)
                    else:
                        target = bucket_dir / filename
                    canonical = f'<link rel="canonical" href="{response.url}">\n</head>'.encode()

                    def write_chunk(f, chunk: bytes) -> None:
                        # Record the source URL as canonical link in HTML that lacks one.
                        if is_html and b'<link rel="canonical"' not in chunk:
                            chunk = chunk.replace(b'</head>', canonical)
                        f.write(chunk)

                    with target.open("wb") as f:
                        if html_body is not None:
                            write_chunk(f, html_body)
                        else:
                            async for chunk in response.content.iter_any():
                                write_chunk(f, chunk)
                    return filename
            except (ClientError, asyncio.TimeoutError) as e:
                debug.log(f"Download failed: {e.__class__.__name__}: {e}")
            return None
        for filename in await asyncio.gather(*[download_url(url, max_depth) for url in urls]):
            if filename:
                yield filename
            else:
                await asyncio.sleep(delay)
        while new_urls:
            next_urls = []
            for i in range(0, len(new_urls), group_size):
                chunked_urls = new_urls[i:i + group_size]
                # Forward the caller's settings so crawled pages use the same
                # grouping, timeout and proxy as the initial URLs.
                async for filename in download_urls(
                    bucket_dir, chunked_urls, max_depth - 1, loading_urls, lock, delay + 1, next_urls,
                    group_size=group_size, timeout=timeout, proxy=proxy
                ):
                    yield filename
                await asyncio.sleep(delay)
            new_urls = next_urls

def get_downloads_urls(bucket_dir: Path, delete_files: bool = False) -> Iterator[dict]:
    """Yield the download requests stored in the bucket's downloads file.

    The file is expected to hold a JSON list of objects. An object with a
    ``"url"`` key is converted to the ``"urls"`` form; its other keys are
    kept. Objects with a ``"urls"`` key are yielded unchanged.

    Args:
        bucket_dir: Bucket directory that may contain the downloads file.
        delete_files: When true, the downloads file is removed after loading.

    Yields:
        Dictionaries with a ``"urls"`` list plus any further keys of the entry.
    """
    download_file = bucket_dir / DOWNLOADS_FILE
    if not download_file.exists():
        return
    with download_file.open('r') as f:
        data = json.load(f)
    if delete_files:
        download_file.unlink()
    if not isinstance(data, list):
        return
    for item in data:
        # Only objects can carry a "url"/"urls" key; on other entries ``in``
        # would be a substring or element test.
        if not isinstance(item, dict):
            continue
        if "url" in item:
            yield {"urls": [item.pop("url")], **item}
        elif "urls" in item:
            yield item

def read_and_download_urls(bucket_dir: Path, delete_files: bool = False, event_stream: bool = False) -> Iterator[str]:
    """Download every URL requested for the bucket and record the stored filenames.

    Each downloaded filename is appended to the bucket's file list.

    Args:
        bucket_dir: Bucket directory.
        delete_files: Forwarded to ``get_downloads_urls``.
        event_stream: When true, a ``download`` progress event is yielded per file.

    Yields:
        Server-sent-event strings (only when ``event_stream`` is true).
    """
    # get_downloads_urls returns a generator object, which is always truthy,
    # so the file list is opened unconditionally.
    requests = get_downloads_urls(bucket_dir, delete_files)
    downloaded = 0
    with open(os.path.join(bucket_dir, FILE_LIST), 'a') as file_list:
        for request in requests:
            for filename in to_sync_generator(download_urls(bucket_dir, **request)):
                file_list.write(f"{filename}\n")
                if not event_stream:
                    continue
                downloaded += 1
                yield f'data: {json.dumps({"action": "download", "count": downloaded})}\n\n'

async def async_read_and_download_urls(bucket_dir: Path, delete_files: bool = False, event_stream: bool = False) -> AsyncIterator[str]:
    """Asynchronously download every URL requested for the bucket.

    Each downloaded filename is appended to the bucket's file list.

    Args:
        bucket_dir: Bucket directory.
        delete_files: Forwarded to ``get_downloads_urls``.
        event_stream: When true, a ``download`` progress event is yielded per file.

    Yields:
        Server-sent-event strings (only when ``event_stream`` is true).
    """
    # get_downloads_urls returns a generator object, which is always truthy,
    # so the file list is opened unconditionally.
    requests = get_downloads_urls(bucket_dir, delete_files)
    downloaded = 0
    with open(os.path.join(bucket_dir, FILE_LIST), 'a') as file_list:
        for request in requests:
            async for filename in download_urls(bucket_dir, **request):
                file_list.write(f"{filename}\n")
                if not event_stream:
                    continue
                downloaded += 1
                yield f'data: {json.dumps({"action": "download", "count": downloaded})}\n\n'

def stream_chunks(bucket_dir: Path, delete_files: bool = False, refine_chunks_with_spacy: bool = False, event_stream: bool = False) -> Iterator[str]:
    """Stream the text of a bucket, either as raw chunks or as progress events.

    With ``refine_chunks_with_spacy`` the refined parts are streamed.
    Otherwise the listed files are read through the plain cache, and
    afterwards the file list is processed and removed.

    Args:
        bucket_dir: Bucket directory.
        delete_files: When true, processed files are removed.
        refine_chunks_with_spacy: Select the spaCy-refined stream.
        event_stream: Yield server-sent events with the running byte size
            instead of the text chunks.

    Yields:
        Text chunks, or event strings when ``event_stream`` is true.
    """
    if refine_chunks_with_spacy:
        action = "refine"
        chunks = stream_read_parts_and_refine(bucket_dir, delete_files)
    else:
        action = "load"
        chunks = cache_stream(
            stream_read_files(bucket_dir, get_filenames(bucket_dir), delete_files),
            bucket_dir,
        )

    size = 0
    for chunk in chunks:
        if not event_stream:
            yield chunk
            continue
        size += len(chunk.encode())
        yield f'data: {json.dumps({"action": action, "size": size})}\n\n'

    if not refine_chunks_with_spacy:
        files_txt = os.path.join(bucket_dir, FILE_LIST)
        if os.path.exists(files_txt):
            for filename in get_filenames(bucket_dir):
                # Media events are emitted regardless of ``event_stream``.
                if is_allowed_extension(filename):
                    yield f'data: {json.dumps({"action": "media", "filename": filename})}\n\n'
                stored = os.path.join(bucket_dir, filename)
                if delete_files and os.path.exists(stored):
                    os.remove(stored)
            os.remove(files_txt)
            if event_stream:
                yield f'data: {json.dumps({"action": "delete_files"})}\n\n'

    if event_stream:
        yield f'data: {json.dumps({"action": "done", "size": size})}\n\n'

def get_streaming(bucket_dir: str, delete_files = False, refine_chunks_with_spacy = False, event_stream: bool = False) -> Iterator[str]:
    """Download a bucket's pending URLs, then stream the bucket's content.

    The bucket directory is created if necessary. Any exception is reported
    as an error event when ``event_stream`` is true and then re-raised.

    Args:
        bucket_dir: Path of the bucket directory.
        delete_files: Forwarded to the download and streaming steps.
        refine_chunks_with_spacy: Forwarded to ``stream_chunks``.
        event_stream: Yield server-sent events instead of text chunks.

    Yields:
        Text chunks or event strings.
    """
    bucket_path = Path(bucket_dir)
    bucket_path.mkdir(parents=True, exist_ok=True)
    try:
        yield from read_and_download_urls(bucket_path, delete_files, event_stream)
        yield from stream_chunks(bucket_path, delete_files, refine_chunks_with_spacy, event_stream)
    except Exception as error:
        if event_stream:
            payload = {"error": {"message": str(error)}}
            yield f'data: {json.dumps(payload)}\n\n'
        raise error

async def get_async_streaming(bucket_dir: str, delete_files = False, refine_chunks_with_spacy = False, event_stream: bool = False) -> AsyncIterator[str]:
    """Asynchronously download a bucket's pending URLs, then stream its content.

    The bucket directory is created if necessary. Any exception is reported
    as an error event when ``event_stream`` is true and then re-raised.

    Args:
        bucket_dir: Path of the bucket directory.
        delete_files: Forwarded to the download and streaming steps.
        refine_chunks_with_spacy: Forwarded to ``stream_chunks``.
        event_stream: Yield server-sent events instead of text chunks.

    Yields:
        Text chunks or event strings.
    """
    bucket_dir = Path(bucket_dir)
    bucket_dir.mkdir(parents=True, exist_ok=True)
    try:
        async for chunk in async_read_and_download_urls(bucket_dir, delete_files, event_stream):
            yield chunk
        # Reading and refining is synchronous; only the download step is async.
        for chunk in stream_chunks(bucket_dir, delete_files, refine_chunks_with_spacy, event_stream):
            yield chunk
    except Exception as e:
        if event_stream:
            yield f'data: {json.dumps({"error": {"message": str(e)}})}\n\n'
        raise

def get_tempfile(file, suffix: str = None):
    """Copy a file object into a new named temporary file and close the source.

    Args:
        file: Readable file object; it is closed before returning, also on failure.
        suffix: Optional suffix for the temporary file's name.

    Returns:
        The path of the temporary file. The file is not deleted automatically.
    """
    try:
        copyfile = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
        try:
            with copyfile:
                shutil.copyfileobj(file, copyfile)
        except Exception:
            # Do not leave a partially written temp file behind.
            os.unlink(copyfile.name)
            raise
    finally:
        file.close()
    return copyfile.name