sadegh803211 committed on
Commit f009aa1 · verified · 1 Parent(s): a5c960f

Update app.py

Files changed (1)
  1. app.py +40 -5
app.py CHANGED
@@ -62,10 +62,26 @@ class Utils:
 
     @staticmethod
     def _iter_json_records(json_path: str) -> Iterable[Dict[str, Any]]:
+        # This more robust version can handle both single multi-line JSON objects
+        # and line-delimited JSON.
         with open(json_path, "r", encoding="utf-8") as f:
-            for line in f:
+            txt = f.read().strip()
+            if not txt: return
+            try:
+                # Try to parse the whole file as a single JSON object (list or dict)
+                obj = json.loads(txt)
+                if isinstance(obj, dict):
+                    yield obj
+                    return
+                for it in obj if isinstance(obj, list) else []:
+                    if isinstance(it, dict): yield it
+                return
+            except json.JSONDecodeError:
+                # If that fails, fall back to parsing line by line
+                for line in txt.splitlines():
+                    if not (line := line.strip()): continue
                     try:
-                        yield json.loads(line)
+                        if isinstance((obj := json.loads(line)), dict): yield obj
                     except json.JSONDecodeError:
                         continue
 
@@ -73,6 +89,10 @@ class Utils:
     def _collect_pairs(root: str) -> pd.DataFrame:
         rows = []
         json_files = glob(os.path.join(root, "**/*.json"), recursive=True)
+        if not json_files:
+            print(f"Warning: No JSON files found in {root}. Please check the path.")
+            return pd.DataFrame(rows)
+
         for jp in json_files:
             base_dir = os.path.dirname(jp)
             for rec in Utils._iter_json_records(jp):
@@ -87,6 +107,9 @@ class Utils:
     def _build_docstore(df: pd.DataFrame) -> pd.DataFrame:
         def _mk_id(row_text):
             return hashlib.sha1(row_text.encode("utf-8")).hexdigest()[:16]
+        # Check if the dataframe is empty before proceeding
+        if 'text' not in df.columns:
+            return pd.DataFrame(columns=['id', 'passage_text', 'title'])  # Return empty docstore
         df['id'] = df['text'].apply(_mk_id)
         return df.rename(columns={'text': 'passage_text'})
 
@@ -96,15 +119,26 @@ class Utils:
         os.makedirs(os.path.dirname(out_docstore), exist_ok=True)
         df = Utils._collect_pairs(root)
         print(f"Found {len(df)} total passages.")
-        df.drop_duplicates(subset=['text'], keep='first', inplace=True)
-        print(f"Found {len(df)} unique passages after deduplication.")
-        doc = Utils._build_docstore(df)
+
+        if df.empty:
+            print("Warning: No valid data found to process. The docstore will be empty.")
+            doc = Utils._build_docstore(df)
+        else:
+            df.drop_duplicates(subset=['text'], keep='first', inplace=True)
+            print(f"Found {len(df)} unique passages after deduplication.")
+            doc = Utils._build_docstore(df)
+
         doc.to_parquet(out_docstore, index=False)
         print(f"Docstore saved to {out_docstore}.")
         return doc
 
     def build_faiss_index(encoder, docstore, index_path, text_col="passage_text"):
         print(f"Building FAISS index: {os.path.basename(index_path)}")
+        # Check if docstore is empty
+        if docstore.empty:
+            print("Docstore is empty. Skipping FAISS index creation.")
+            return
+
         texts = docstore[text_col].astype(str).tolist()
         if hasattr(encoder, 'encode_numpy'):
             vecs = encoder.encode_numpy(texts)
@@ -142,3 +176,4 @@ def main():
 
 if __name__ == "__main__":
     main()
+
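
For reference, the parsing strategy this commit introduces can be exercised in isolation. The sketch below mirrors the patched _iter_json_records (whole-file JSON first, then a line-delimited fallback, skipping non-dict records); the file names and sample payloads are hypothetical, for illustration only.

import json
import os
import tempfile
from typing import Any, Dict, Iterable

def iter_json_records(json_path: str) -> Iterable[Dict[str, Any]]:
    # Mirrors the patched _iter_json_records: try the whole file as one JSON
    # document first, then fall back to line-delimited (JSONL) parsing.
    with open(json_path, "r", encoding="utf-8") as f:
        txt = f.read().strip()
    if not txt:
        return
    try:
        obj = json.loads(txt)
        if isinstance(obj, dict):
            yield obj
            return
        for it in obj if isinstance(obj, list) else []:
            if isinstance(it, dict):
                yield it
        return
    except json.JSONDecodeError:
        for line in txt.splitlines():
            if not (line := line.strip()):
                continue
            try:
                if isinstance((obj := json.loads(line)), dict):
                    yield obj
            except json.JSONDecodeError:
                continue

with tempfile.TemporaryDirectory() as d:
    jsonl_path = os.path.join(d, "records.jsonl")   # hypothetical JSONL input
    single_path = os.path.join(d, "records.json")   # hypothetical multi-line JSON list
    with open(jsonl_path, "w", encoding="utf-8") as f:
        f.write('{"text": "a"}\n{"text": "b"}\nnot json\n')
    with open(single_path, "w", encoding="utf-8") as f:
        f.write('[{"text": "a"},\n {"text": "b"}]')
    assert list(iter_json_records(jsonl_path)) == [{"text": "a"}, {"text": "b"}]
    assert list(iter_json_records(single_path)) == [{"text": "a"}, {"text": "b"}]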
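
Similarly, the deduplication and id-assignment path through the docstore-building code boils down to the following; the sample rows are made up, and only the handling of the 'text' column is taken from the diff.

import hashlib
import pandas as pd

df = pd.DataFrame({"text": ["alpha", "beta", "alpha"]})

# As in the docstore-building function: keep the first occurrence of each passage.
df = df.drop_duplicates(subset=["text"], keep="first")

# As in _build_docstore: a 16-hex-character sha1 prefix serves as the row id.
df["id"] = df["text"].apply(
    lambda t: hashlib.sha1(t.encode("utf-8")).hexdigest()[:16]
)
doc = df.rename(columns={"text": "passage_text"})
print(doc)  # two rows, 'alpha' and 'beta', each with a stable 16-char id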
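
Finally, the tail of build_faiss_index is cut off in this diff, so the index type is not visible here. A common pattern for the encode-then-index step it sets up is a flat inner-product index over L2-normalized vectors; the sketch below is an assumption about that pattern, not the app's actual code.

import faiss
import numpy as np

def build_flat_ip_index(vecs: np.ndarray, index_path: str) -> None:
    # Normalize so that inner product equals cosine similarity.
    vecs = np.ascontiguousarray(vecs, dtype="float32")
    faiss.normalize_L2(vecs)
    index = faiss.IndexFlatIP(vecs.shape[1])
    index.add(vecs)
    faiss.write_index(index, index_path)

build_flat_ip_index(np.random.rand(10, 8).astype("float32"), "demo.index")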