# files_process.py
"""Load input text from a raw string or from a .txt / .docx / .pdf file."""
import pathlib
import tempfile
from typing import Optional, Union

# pypdf and python-docx are only needed for their own formats. Guard the
# imports so raw-text and .txt input keeps working when one is not installed;
# the reader below raises a clear ImportError if the format is requested.
try:
    from pypdf import PdfReader
except ImportError:
    PdfReader = None

try:
    from docx import Document
except ImportError:
    Document = None

# Strings longer than this are treated as raw text, never probed as a path.
_MAX_PATH_LIKE_LEN = 512


def _read_file_by_ext(p: pathlib.Path) -> str:
    """Return the text content of *p*, picking a reader from its extension.

    Args:
        p: Path to a .txt, .docx or .pdf file (extension is case-insensitive).

    Returns:
        The extracted text. .txt files are decoded as UTF-8 with undecodable
        bytes dropped; .docx paragraphs and .pdf pages are joined with
        newlines.

    Raises:
        ValueError: If the extension is not .txt, .docx or .pdf.
        ImportError: If the third-party reader for the format is missing.
    """
    ext = p.suffix.lower()
    if ext == ".txt":
        return p.read_text(encoding="utf-8", errors="ignore")
    if ext == ".docx":
        if Document is None:
            raise ImportError("python-docx is required to read .docx files")
        doc = Document(str(p))
        return "\n".join(paragraph.text for paragraph in doc.paragraphs)
    if ext == ".pdf":
        if PdfReader is None:
            raise ImportError("pypdf is required to read .pdf files")
        reader = PdfReader(str(p))
        # Pages for which extract_text() yields nothing are skipped.
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(t for t in page_texts if t)
    raise ValueError(f"Unsupported file extension: {ext}. Use .txt / .docx / .pdf.")


def load_input_text(input_arg: Union[str, pathlib.Path]) -> str:
    """Load text from a raw string or from a file path (.txt, .docx, .pdf).

    - A string that contains a line break or is longer than 512 characters
      is returned as-is without touching the filesystem.
    - Otherwise, if the argument names an existing regular file, the file is
      read by extension.
    - Anything else (no such file, a directory, or a path the OS refuses to
      probe, e.g. Errno 36 "name too long") is returned as raw text.

    Args:
        input_arg: Raw text, a path string, or a ``pathlib.Path``.

    Returns:
        The file's text, or ``str(input_arg)`` when it is not a readable
        file path.

    Raises:
        ValueError: If ``input_arg`` is None, or the file has an unsupported
            extension.
        OSError: If the file exists but cannot be read. Only errors from
            probing the path are swallowed, not errors from reading it.
    """
    if input_arg is None:
        raise ValueError("input_arg is required")

    raw = str(input_arg)
    if isinstance(input_arg, pathlib.Path):
        candidate = input_arg
    else:
        if "\n" in raw or "\r" in raw or len(raw) > _MAX_PATH_LIKE_LEN:
            return raw
        candidate = pathlib.Path(raw)

    try:
        # is_file() rather than exists(): a directory such as "." must be
        # treated as text, not handed to the extension-based reader.
        is_file = candidate.is_file()
    except (OSError, ValueError):
        # Over-long names, embedded NULs, etc. -- clearly not a path.
        return raw
    return _read_file_by_ext(candidate) if is_file else raw


def _sniff_suffix(content: bytes) -> str:
    """Guess a supported file suffix from the leading bytes of *content*.

    Returns ".pdf" for a PDF header, ".docx" for a ZIP container signature
    (.docx files are ZIP archives), ".txt" for valid UTF-8, and ".bin" for
    anything else so that unknown binary data is still rejected downstream.
    """
    if content.startswith(b"%PDF-"):
        return ".pdf"
    if content.startswith(b"PK\x03\x04"):
        return ".docx"
    try:
        bytes(content).decode("utf-8")
    except UnicodeDecodeError:
        return ".bin"
    return ".txt"


def _uploaded_file_path(file_obj) -> pathlib.Path:
    """Return a filesystem path for an uploaded file object.

    Uses ``file_obj.name`` (when it is a string) or ``file_obj["name"]`` for
    dicts. Otherwise the object's ``read()`` result is saved to a unique
    temporary file whose suffix is guessed from the content.

    Raises:
        ValueError: If the object offers neither a name nor a ``read()``.
    """
    name = getattr(file_obj, "name", None)
    if isinstance(name, str):
        return pathlib.Path(name)
    if isinstance(file_obj, dict) and "name" in file_obj:
        return pathlib.Path(file_obj["name"])

    reader = getattr(file_obj, "read", None)
    if not callable(reader):
        raise ValueError("Unsupported uploaded file object; missing .name or .read()")
    content = reader()
    if isinstance(content, str):
        # Text-mode file objects hand back str; store it as UTF-8.
        content = content.encode("utf-8")
    # A unique file per call: a fixed shared name would let concurrent
    # uploads overwrite each other.
    with tempfile.NamedTemporaryFile(
        prefix="upload_", suffix=_sniff_suffix(content), delete=False
    ) as tmp:
        tmp.write(content)
    return pathlib.Path(tmp.name)


def prepare_input_arg(text_value: Optional[str], file_obj) -> str:
    """Combine textbox text and a single uploaded file (.txt/.docx/.pdf).

    Compatible with Gradio/scripts where ``file_obj`` may expose a ``.name``
    attribute, be a dict with a ``"name"`` key, or only offer ``.read()``.

    Args:
        text_value: Text typed by the user; may be None or blank.
        file_obj: The uploaded file object, or None when nothing was uploaded.

    Returns:
        - the stripped text, when only text is given;
        - the path of the uploaded file, when only a file is given;
        - the path of a new temporary .txt file holding the text, a blank
          line, and the file's extracted text, when both are given.

        Temporary files created here are not deleted automatically; the
        caller owns them.

    Raises:
        ValueError: If neither text nor a file is provided, or the file
            object is of an unsupported kind.
    """
    text = (text_value or "").strip()
    if file_obj is None:
        if not text:
            raise ValueError("Provide either text or upload a .txt/.docx/.pdf")
        return text

    up_path = _uploaded_file_path(file_obj)
    if not text:
        return str(up_path)

    # Read the upload before creating the output file so a failed read does
    # not leave a half-written temp file behind. Passing the Path itself
    # (not a string) keeps it from being mistaken for raw text.
    appended = load_input_text(up_path)
    with tempfile.NamedTemporaryFile(
        "w", encoding="utf-8", prefix="concat_input_", suffix=".txt", delete=False
    ) as tmp:
        tmp.write(text + "\n\n" + appended)
    return tmp.name