Spaces:
Sleeping
Sleeping
| # upload_ingest.py | |
| from __future__ import annotations | |
| import os | |
| import json | |
| from typing import Dict, List, Any | |
| import pandas as pd | |
| # Optional parsers | |
| try: | |
| import pdfplumber # noqa: F401 | |
| _HAS_PDFPLUMBER = True | |
| except Exception: | |
| _HAS_PDFPLUMBER = False | |
| def _read_text_file(path: str) -> str: | |
| try: | |
| with open(path, "r", encoding="utf-8", errors="ignore") as f: | |
| return f.read() | |
| except Exception: | |
| return "" | |
| def _read_csv_artifact(path: str) -> Dict[str, Any]: | |
| # Read a manageable slice, treat everything as string to avoid dtype issues | |
| df = pd.read_csv(path, nrows=1000, dtype=str, low_memory=False) | |
| cols = list(df.columns.astype(str)) | |
| # Build a short textual summary to help retrieval too | |
| preview = df.head(3).to_dict(orient="records") | |
| text_summary = f"CSV FILE: {os.path.basename(path)}\nCOLUMNS: {', '.join(cols)}\nSAMPLE ROWS: {json.dumps(preview)}" | |
| return { | |
| "kind": "csv", | |
| "name": os.path.basename(path), | |
| "path": path, | |
| "columns": cols, | |
| "n_rows_sampled": len(df), | |
| "preview_rows": preview, | |
| "text": text_summary, | |
| } | |
def _read_pdf_text(path: str) -> str:
    """Extract text from the first 15 pages of a PDF.

    Returns "" when pdfplumber is unavailable or the file cannot be parsed.
    """
    if not _HAS_PDFPLUMBER:
        # Parser missing — skip gracefully rather than raise.
        return ""
    import pdfplumber

    page_texts = []
    try:
        with pdfplumber.open(path) as pdf:
            # Cap at 15 pages to keep ingestion fast on large documents.
            for page in pdf.pages[:15]:
                extracted = page.extract_text() or ""
                if extracted.strip():
                    page_texts.append(extracted)
    except Exception:
        return ""
    return "\n\n".join(page_texts)
| def _read_docx_text(path: str) -> str: | |
| try: | |
| import docx | |
| except Exception: | |
| return "" | |
| try: | |
| doc = docx.Document(path) | |
| return "\n".join(p.text for p in doc.paragraphs if p.text.strip()) | |
| except Exception: | |
| return "" | |
| def _read_image_text(path: str) -> str: | |
| # Best-effort OCR | |
| try: | |
| import pytesseract | |
| from PIL import Image | |
| img = Image.open(path) | |
| return pytesseract.image_to_string(img) or "" | |
| except Exception: | |
| return "" | |
def extract_text_from_files(paths: List[str]) -> Dict[str, Any]:
    """
    Extract retrieval text and structured artifacts from uploaded files.

    Returns a dict:
        {
            "chunks": [str, ...],      # text chunks for retrieval
            "artifacts": [ { structured meta }, ... ]  # e.g., CSV columns
        }
    Backward compatible: callers expecting a list of strings can use ["chunks"].

    Nonexistent or empty paths are skipped; a chunk is only added when it
    contains non-whitespace text.
    """
    chunks: List[str] = []
    artifacts: List[Dict[str, Any]] = []
    for p in paths or []:
        if not p or not os.path.exists(p):
            continue
        # Dispatch on lowercased extension so ".CSV" etc. are handled too.
        name = os.path.basename(p).lower()
        if name.endswith(".csv"):
            try:
                art = _read_csv_artifact(p)
                artifacts.append(art)
                # The textual summary doubles as a retrieval chunk.
                chunks.append(art["text"])
            except Exception:
                # Fall back to raw text, but only keep non-empty results
                # (bug fix: previously an empty fallback string could be
                # appended, producing a useless blank chunk).
                txt = _read_text_file(p)
                if txt.strip():
                    chunks.append(txt)
        elif name.endswith(".pdf"):
            txt = _read_pdf_text(p)
            if txt.strip():
                chunks.append(txt)
        elif name.endswith(".docx"):
            txt = _read_docx_text(p)
            if txt.strip():
                chunks.append(txt)
        elif name.endswith((".txt", ".md", ".json")):
            txt = _read_text_file(p)
            if txt.strip():
                chunks.append(txt)
        elif name.endswith((".png", ".jpg", ".jpeg")):
            txt = _read_image_text(p)
            if txt.strip():
                chunks.append(f"IMAGE OCR ({os.path.basename(p)}):\n{txt}")
        else:
            # Unknown type: try to read as plain text.
            txt = _read_text_file(p)
            if txt.strip():
                chunks.append(txt)
    return {"chunks": chunks, "artifacts": artifacts}