stefanches7 committed on
Commit
4176f35
·
2 Parent(s): f957d0a 83d2071

Merge branch 'main' of https://github.com/stefanches7/AI-assisted-Neuroimaging-harmonization

Browse files
Files changed (2) hide show
  1. .gitignore +4 -0
  2. cli.py +24 -0
.gitignore CHANGED
@@ -1 +1,5 @@
 
 
 
 
1
  /Non_Bids_Dataset
 
1
+ .env
2
+ __pycache__/
3
+ .venv
4
+ testing_structure.xml
5
  /Non_Bids_Dataset
cli.py CHANGED
@@ -10,11 +10,35 @@ from logging_utils import setup_logging
10
  from agent import BIDSifierAgent
11
 
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
def _read_optional(path: Optional[str]) -> Optional[str]:
    """Return the text of the file at *path*, or None when no path is given.

    Args:
        path: Path of the file to read; None or an empty string means
            "nothing to read".

    Returns:
        The file contents decoded as UTF-8, or None for a missing/empty path.

    Raises:
        FileNotFoundError: If *path* is given but is not an existing file.
    """
    if path:
        if os.path.isfile(path):
            # errors="ignore" drops undecodable bytes instead of raising.
            with open(path, "r", encoding="utf-8", errors="ignore") as handle:
                return handle.read()
        raise FileNotFoundError(f"File not found: {path}")
    return None
20
 
 
10
  from agent import BIDSifierAgent
11
 
12
 
13
def _read_pdf(path: str) -> str:
    """Extract the text of a PDF file using pypdf.

    Pages are visited in order. Every page that yields non-blank text is
    emitted under a ``=== Page N ===`` marker (N is 1-based); pages with no
    text, or whose extraction fails, are left out of the result.

    Args:
        path: Location of the PDF file on disk.

    Returns:
        The marked-up text of all readable pages with leading and trailing
        whitespace removed. Empty when no page yields any text.

    Raises:
        RuntimeError: If the ``pypdf`` package is not installed.
        OSError: If the file cannot be opened.
    """
    import logging

    try:
        from pypdf import PdfReader
    except ImportError as e:
        raise RuntimeError(
            "Reading PDFs requires the 'pypdf' package. Install it with: pip install pypdf"
        ) from e

    log = logging.getLogger(__name__)
    text_parts = []
    with open(path, "rb") as f:
        reader = PdfReader(f)
        for page_number, page in enumerate(reader.pages, start=1):
            try:
                text = page.extract_text() or ""
            except Exception:
                # Best effort: one unreadable page must not abort the whole
                # document, but the loss is reported instead of being hidden.
                log.warning(
                    "Could not extract text from page %d of %s; skipping it",
                    page_number,
                    path,
                    exc_info=True,
                )
                continue
            text = text.strip()
            if text:
                # Add lightweight page markers to help the LLM
                text_parts.append(f"\n\n=== Page {page_number} ===\n{text}")
    return "\n".join(text_parts).strip()
33
+
34
def _read_optional(path: Optional[str]) -> Optional[str]:
    """Load the textual content of an optional input file.

    A ``.pdf`` extension (matched case-insensitively) routes the file through
    ``_read_pdf``; anything else is read as UTF-8 text.

    Args:
        path: Path of the file to load; None or an empty string means
            "nothing to read".

    Returns:
        The file's text, or None when no path was supplied.

    Raises:
        FileNotFoundError: If *path* is given but is not an existing file.
    """
    if not path:
        return None
    if not os.path.isfile(path):
        raise FileNotFoundError(f"File not found: {path}")

    _, suffix = os.path.splitext(path)
    if suffix.lower() == ".pdf":
        return _read_pdf(path)

    # errors="ignore" drops undecodable bytes instead of raising.
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        contents = handle.read()
    return contents
44