Rajan Sharma committed on
Commit
023cf3a
·
verified ·
1 Parent(s): f20b5e0

Update upload_ingest.py

Browse files
Files changed (1) hide show
  1. upload_ingest.py +112 -80
upload_ingest.py CHANGED
@@ -1,92 +1,124 @@
1
- import os, mimetypes
2
- from typing import List, Tuple
3
- import pdfplumber
4
- from docx import Document as DocxDocument
5
- from PIL import Image
6
- import pytesseract
7
 
8
- from settings import ALLOWED_EXT, ALLOWED_MIME, MAX_UPLOAD_MB, ENABLE_AV_SCAN, CLAMD_UNIX_SOCKET, CLAMD_NETWORK
9
- from privacy import redact_text
 
 
 
 
10
 
11
- # --- Optional AV scan (clamd) ---
12
def _clamd_scan(path: str) -> bool:
    """Run the optional ClamAV (clamd) scan on ``path``.

    Returns ``True`` when scanning is disabled (``ENABLE_AV_SCAN`` is falsy),
    when neither ``CLAMD_UNIX_SOCKET`` nor ``CLAMD_NETWORK`` yields a client,
    when the reply is not a dict, when the first verdict in the reply is
    ``"OK"``, or when anything raises (fail-open). Returns ``False`` only when
    the first verdict is something other than ``"OK"``.
    """
    if not ENABLE_AV_SCAN:
        return True
    try:
        import clamd

        if CLAMD_UNIX_SOCKET:
            client = clamd.ClamdUnixSocket(CLAMD_UNIX_SOCKET)
        elif CLAMD_NETWORK:
            host, port = CLAMD_NETWORK
            client = clamd.ClamdNetworkSocket(host, port)
        else:
            client = None

        if not client:
            return True

        reply = client.scan(path)
        # Expected: {'/path/file': ('OK', 'OK')} or ('FOUND','Eicar-Test-Signature')
        if not isinstance(reply, dict):
            return True
        first_entry = next(iter(reply.values()))
        return first_entry[0] == "OK"
    except Exception:
        # If AV is unavailable (or the reply is malformed), fail open.
        return True
32
-
33
def _check_allowed(path: str) -> tuple[bool, str]:
    """Decide whether the file at ``path`` may be ingested.

    Checks run in this order and stop at the first failure: lower-cased
    extension against ``ALLOWED_EXT``, MIME type guessed from the file name
    against ``ALLOWED_MIME``, size on disk against ``MAX_UPLOAD_MB``, then
    ``_clamd_scan``.

    Returns:
        ``(True, "ok")`` when every check passes, otherwise ``(False, reason)``
        with a human-readable explanation.
    """
    _, extension = os.path.splitext(path.lower())
    if extension not in ALLOWED_EXT:
        return False, f"Extension {extension} not allowed."

    guessed_mime = mimetypes.guess_type(path)[0]
    if guessed_mime not in ALLOWED_MIME:
        return False, f"MIME {guessed_mime} not allowed."

    bytes_per_mb = 1024 * 1024
    size_mb = os.path.getsize(path) / bytes_per_mb
    if size_mb > MAX_UPLOAD_MB:
        return False, f"File too large ({size_mb:.1f}MB > {MAX_UPLOAD_MB}MB)."

    if _clamd_scan(path):
        return True, "ok"
    return False, "Antivirus scan failed."
46
 
47
- def _read_text_file(path: str) -> str:
48
- with open(path, "r", encoding="utf-8", errors="ignore") as f:
49
- return f.read()
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
def _read_docx(path: str) -> str:
    """Return the text of every paragraph in the .docx at ``path``.

    Paragraph texts are joined with a single newline; empty paragraphs are
    kept, so they show up as blank lines.
    """
    document = DocxDocument(path)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)
54
-
55
def _read_pdf(path: str) -> str:
    """Return the extracted text of every page of the PDF at ``path``.

    A page whose ``extract_text()`` result is falsy contributes an empty
    string; page texts are joined with a single newline.
    """
    with pdfplumber.open(path) as pdf:
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(page_texts)
 
 
 
 
 
61
 
62
def _read_image_ocr(path: str) -> str:
    """OCR the image at ``path`` with pytesseract and return the result.

    The image is opened in a ``with`` block so the underlying file handle is
    released as soon as OCR finishes instead of lingering until garbage
    collection. Errors from Pillow or pytesseract propagate to the caller.
    """
    with Image.open(path) as img:
        return pytesseract.image_to_string(img)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
def extract_text_from_files(filepaths: List[str]) -> List[Tuple[str, str]]:
    """
    Collect ``(basename, redact_text(text))`` pairs for every approved file.

    A file is skipped when ``_check_allowed`` rejects it, when its reader
    raises, or when the extracted text is empty or whitespace-only. A ``None``
    or empty ``filepaths`` yields an empty list.
    """
    plain_text_exts = {".txt", ".md", ".csv"}
    image_exts = {".png", ".jpg", ".jpeg", ".webp"}
    collected: List[Tuple[str, str]] = []

    for candidate in filepaths or []:
        approved, _reason = _check_allowed(candidate)
        if not approved:
            # skip silently or raise/log upstream
            continue

        suffix = os.path.splitext(candidate.lower())[1]
        try:
            if suffix in plain_text_exts:
                extracted = _read_text_file(candidate)
            elif suffix == ".docx":
                extracted = _read_docx(candidate)
            elif suffix == ".pdf":
                extracted = _read_pdf(candidate)
            elif suffix in image_exts:
                extracted = _read_image_ocr(candidate)
            else:
                extracted = ""

            if not extracted or not extracted.strip():
                continue
            collected.append((os.path.basename(candidate), redact_text(extracted)))
        except Exception:
            continue

    return collected
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # upload_ingest.py
2
+ from __future__ import annotations
3
+ import os
4
+ import json
5
+ from typing import Dict, List, Any
6
+ import pandas as pd
7
 
8
+ # Optional parsers
9
+ try:
10
+ import pdfplumber # noqa: F401
11
+ _HAS_PDFPLUMBER = True
12
+ except Exception:
13
+ _HAS_PDFPLUMBER = False
14
 
15
+ def _read_text_file(path: str) -> str:
 
 
 
16
  try:
17
+ with open(path, "r", encoding="utf-8", errors="ignore") as f:
18
+ return f.read()
 
 
 
 
 
 
 
 
 
 
 
19
  except Exception:
20
+ return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
+ def _read_csv_artifact(path: str) -> Dict[str, Any]:
23
+ # Read a manageable slice, treat everything as string to avoid dtype issues
24
+ df = pd.read_csv(path, nrows=1000, dtype=str, low_memory=False)
25
+ cols = list(df.columns.astype(str))
26
+ # Build a short textual summary to help retrieval too
27
+ preview = df.head(3).to_dict(orient="records")
28
+ text_summary = f"CSV FILE: {os.path.basename(path)}\nCOLUMNS: {', '.join(cols)}\nSAMPLE ROWS: {json.dumps(preview)}"
29
+ return {
30
+ "kind": "csv",
31
+ "name": os.path.basename(path),
32
+ "path": path,
33
+ "columns": cols,
34
+ "n_rows_sampled": len(df),
35
+ "preview_rows": preview,
36
+ "text": text_summary,
37
+ }
38
 
39
+ def _read_pdf_text(path: str) -> str:
40
+ # Keep it simple; if pdfplumber missing, skip gracefully
41
+ if not _HAS_PDFPLUMBER:
42
+ return ""
43
+ import pdfplumber
44
  out = []
45
+ try:
46
+ with pdfplumber.open(path) as pdf:
47
+ for page in pdf.pages[:15]: # cap pages for speed
48
+ t = page.extract_text() or ""
49
+ if t.strip():
50
+ out.append(t)
51
+ except Exception:
52
+ return ""
53
+ return "\n\n".join(out)
54
 
55
+ def _read_docx_text(path: str) -> str:
56
+ try:
57
+ import docx
58
+ except Exception:
59
+ return ""
60
+ try:
61
+ doc = docx.Document(path)
62
+ return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
63
+ except Exception:
64
+ return ""
65
+
66
+ def _read_image_text(path: str) -> str:
67
+ # Best-effort OCR
68
+ try:
69
+ import pytesseract
70
+ from PIL import Image
71
+ img = Image.open(path)
72
+ return pytesseract.image_to_string(img) or ""
73
+ except Exception:
74
+ return ""
75
 
76
def extract_text_from_files(paths: List[str]) -> Dict[str, Any]:
    """
    Extract retrievable text and structured metadata from uploaded files.

    Returns a dict:
      {
        "chunks": [str, ...],                      # text chunks for retrieval
        "artifacts": [ { structured meta }, ... ]  # e.g., CSV columns
      }
    Callers that only need the text can use ["chunks"].

    Dispatch is by lower-cased file name suffix:
      * ``.csv``  -> structured artifact plus its textual summary; if parsing
        fails, the raw file text is used instead.
      * ``.pdf`` / ``.docx`` -> extracted document text.
      * ``.png`` / ``.jpg`` / ``.jpeg`` / ``.webp`` -> OCR text, prefixed with
        an ``IMAGE OCR (<name>):`` header.
      * anything else (including ``.txt`` / ``.md`` / ``.json``) -> read as text.

    Empty entries, paths that are not regular files, and files that yield only
    blank text are skipped. ``None`` or an empty list yields empty results.
    """
    chunks: List[str] = []
    artifacts: List[Dict[str, Any]] = []

    for p in paths or []:
        # isfile rather than exists: a directory cannot be parsed by any reader.
        if not p or not os.path.isfile(p):
            continue

        base = os.path.basename(p)
        name = base.lower()

        if name.endswith(".csv"):
            try:
                art = _read_csv_artifact(p)
            except Exception:
                # fall back to raw text if any
                txt = _read_text_file(p)
            else:
                artifacts.append(art)
                # also add the textual summary to chunks
                txt = art["text"]
        elif name.endswith(".pdf"):
            txt = _read_pdf_text(p)
        elif name.endswith(".docx"):
            txt = _read_docx_text(p)
        elif name.endswith((".png", ".jpg", ".jpeg", ".webp")):
            ocr = _read_image_text(p)
            txt = f"IMAGE OCR ({base}):\n{ocr}" if ocr.strip() else ""
        else:
            # .txt / .md / .json and unknown types: try to read as text
            txt = _read_text_file(p)

        # Single gate for every branch so blank text never becomes a chunk.
        if txt.strip():
            chunks.append(txt)

    return {"chunks": chunks, "artifacts": artifacts}
124
+