Ekain Arrieta committed on
Commit
83d2071
·
2 Parent(s): ba7c80f 57f68aa

Merge pull request #9 from stefanches7/testing

Browse files
Files changed (2) hide show
  1. .gitignore +4 -0
  2. cli.py +24 -0
.gitignore CHANGED
@@ -1 +1,5 @@
 
 
 
 
1
  /Non_Bids_Dataset
 
1
+ .env
2
+ __pycache__/
3
+ .venv
4
+ testing_structure.xml
5
  /Non_Bids_Dataset
cli.py CHANGED
@@ -7,11 +7,35 @@ from typing import List, Optional
7
  from agent import BIDSifierAgent
8
 
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
def _read_optional(path: Optional[str]) -> Optional[str]:
    """Return the text stored at ``path``, or ``None`` when no path is given.

    Args:
        path: Path of a text file to read; ``None`` or an empty string
            means "nothing to read".

    Returns:
        The file's contents decoded as UTF-8, or ``None`` if ``path`` is falsy.

    Raises:
        FileNotFoundError: If ``path`` is given but is not an existing
            regular file.
    """
    if path:
        if os.path.isfile(path):
            # errors="ignore" drops undecodable bytes instead of raising.
            with open(path, "r", encoding="utf-8", errors="ignore") as handle:
                contents = handle.read()
            return contents
        raise FileNotFoundError(f"File not found: {path}")
    return None
17
 
 
7
  from agent import BIDSifierAgent
8
 
9
 
def _read_pdf(path: str) -> str:
    """Extract text from a PDF file using pypdf.

    Pages are visited in order. Every page that yields non-blank text is
    emitted under a ``=== Page N ===`` marker (1-based); pages whose text is
    empty, whitespace-only, or cannot be extracted are skipped.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The page texts joined together with surrounding whitespace stripped,
        or an empty string when no page produced any text.

    Raises:
        RuntimeError: If the ``pypdf`` package is not installed.
    """
    import logging

    # pypdf is imported inside the function so a missing package surfaces as
    # an actionable error only when a PDF is actually read.
    try:
        from pypdf import PdfReader
    except ImportError as e:
        raise RuntimeError(
            "Reading PDFs requires the 'pypdf' package. Install it with: pip install pypdf"
        ) from e

    logger = logging.getLogger(__name__)
    text_parts: List[str] = []
    with open(path, "rb") as f:
        reader = PdfReader(f)
        for page_number, page in enumerate(reader.pages, start=1):
            try:
                text = page.extract_text() or ""
            except Exception as exc:
                # Best-effort: a single unreadable page must not abort the
                # whole document, but the failure should not go unnoticed.
                logger.warning(
                    "Could not extract text from page %d of %s: %s",
                    page_number,
                    path,
                    exc,
                )
                continue
            text = text.strip()
            if text:
                # Add lightweight page markers to help the LLM
                text_parts.append(f"\n\n=== Page {page_number} ===\n{text}")
    return "\n".join(text_parts).strip()
30
+
def _read_optional(path: Optional[str]) -> Optional[str]:
    """Load the text behind an optional file path.

    Files whose extension is ``.pdf`` (case-insensitive) are handed to
    ``_read_pdf``; every other file is read as UTF-8 text.

    Args:
        path: Path of the file to read; ``None`` or an empty string means
            "nothing to read".

    Returns:
        The extracted or decoded text, or ``None`` if ``path`` is falsy.

    Raises:
        FileNotFoundError: If ``path`` is given but is not an existing
            regular file.
    """
    if not path:
        return None

    exists_as_file = os.path.isfile(path)
    if not exists_as_file:
        raise FileNotFoundError(f"File not found: {path}")

    _, suffix = os.path.splitext(path)
    if suffix.lower() == ".pdf":
        return _read_pdf(path)

    # errors="ignore" drops undecodable bytes instead of raising.
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        contents = handle.read()
    return contents
41