# File size: 3,149 Bytes
# b444f65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# files_process.py
import pathlib
import tempfile
from typing import Union

from docx import Document
from pypdf import PdfReader

def _read_file_by_ext(p: pathlib.Path) -> str:
    """Return the text content of *p*, choosing a reader from its suffix.

    The suffix is compared case-insensitively.

    - ``.txt``  : decoded as UTF-8; undecodable bytes are dropped.
    - ``.docx`` : the text of every paragraph, joined with newlines.
    - ``.pdf``  : the extracted text of every page that yields any,
      joined with newlines.

    Raises:
        ValueError: if the suffix is not one of the three above.
    """
    suffix = p.suffix.lower()

    # Reject unknown formats before touching the file.
    if suffix not in (".txt", ".docx", ".pdf"):
        raise ValueError(f"Unsupported file extension: {suffix}. Use .txt / .docx / .pdf.")

    if suffix == ".txt":
        return p.read_text(encoding="utf-8", errors="ignore")

    if suffix == ".docx":
        return "\n".join(para.text for para in Document(str(p)).paragraphs)

    # Only ".pdf" remains; pages whose extraction is empty/None are skipped.
    extracted = (page.extract_text() for page in PdfReader(str(p)).pages)
    return "\n".join(chunk for chunk in extracted if chunk)

def load_input_text(input_arg: Union[str, pathlib.Path]) -> str:
    """
    Load text from a string, or from a file path (.txt, .docx, .pdf).

    - A ``pathlib.Path`` argument is always treated as a candidate file.
    - A string that contains a newline / carriage return, or is longer than
      512 characters, is considered plain text and returned as-is without
      touching the filesystem.
    - Otherwise, if the argument names an existing *regular file*, it is read
      according to its extension via ``_read_file_by_ext``.
    - Anything else (missing path, directory, empty string) is returned
      unchanged as text.
    - On any OSError raised while probing or reading (e.g., Errno 36, file
      name too long), the argument is returned as raw text.

    Raises:
        ValueError: if ``input_arg`` is None, or if the file exists but has
            an unsupported extension (raised by ``_read_file_by_ext``).
    """
    if input_arg is None:
        raise ValueError("input_arg is required")

    raw = str(input_arg)
    if isinstance(input_arg, pathlib.Path):
        candidate = input_arg
    else:
        # Cheap heuristic: multi-line or very long input cannot be a path.
        if ("\n" in raw) or ("\r" in raw) or (len(raw) > 512):
            return raw
        candidate = pathlib.Path(raw)

    try:
        # is_file() rather than exists(): a directory is not readable input.
        # Notably Path("") == Path("."), so with exists() an empty string or a
        # word matching a directory name was sent to the extension reader and
        # failed with "Unsupported file extension".
        if candidate.is_file():
            return _read_file_by_ext(candidate)
    except OSError:
        # Best-effort: unprobeable/unreadable paths are treated as raw text.
        return raw
    return raw

def prepare_input_arg(text_value: str | None, file_obj) -> str:
    """
    Combine textbox text and a single uploaded file (.txt/.docx/.pdf).
    If both present, concatenate into a temp text file and return its path.
    Compatible with Gradio/Scripts where file_obj may have a .name attribute or be a dict.
    """
    text = (text_value or "").strip()
    if file_obj is None and not text:
        raise ValueError("Provide either text or upload a .txt/.docx/.pdf")

    # If only text
    if file_obj is None:
        return text

    # Best-effort path extraction
    if hasattr(file_obj, "name") and isinstance(file_obj.name, str):
        up_path = pathlib.Path(file_obj.name)
    elif isinstance(file_obj, dict) and "name" in file_obj:
        up_path = pathlib.Path(file_obj["name"])
    else:
        # As a fallback, write bytes if available
        data = getattr(file_obj, "read", None)
        if callable(data):
            content = file_obj.read()
            up_path = pathlib.Path("/tmp/upload.bin")
            up_path.write_bytes(content)
        else:
            raise ValueError("Unsupported uploaded file object; missing .name or .read()")

    if text:
        tmp = pathlib.Path("/tmp/_concat_input.txt")
        tmp.write_text(text + "\n\n", encoding="utf-8")
        appended = load_input_text(str(up_path))
        tmp.write_text(tmp.read_text(encoding="utf-8") + appended, encoding="utf-8")
        return str(tmp)

    return str(up_path)