Update app.py
app.py (CHANGED)
```diff
@@ -62,10 +62,26 @@ class Utils:
 
     @staticmethod
     def _iter_json_records(json_path: str) -> Iterable[Dict[str, Any]]:
+        # This more robust version can handle both single multi-line JSON objects
+        # and line-delimited JSON.
         with open(json_path, "r", encoding="utf-8") as f:
-
+            txt = f.read().strip()
+        if not txt: return
+        try:
+            # Try to parse the whole file as a single JSON object (list or dict)
+            obj = json.loads(txt)
+            if isinstance(obj, dict):
+                yield obj
+                return
+            for it in obj if isinstance(obj, list) else []:
+                if isinstance(it, dict): yield it
+            return
+        except json.JSONDecodeError:
+            # If that fails, fall back to parsing line by line
+            for line in txt.splitlines():
+                if not (line := line.strip()): continue
                 try:
-
+                    if isinstance((obj := json.loads(line)), dict): yield obj
                 except json.JSONDecodeError:
                     continue
 
```
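The rewritten reader first tries to parse the entire file as one JSON value and only falls back to line-by-line parsing when that fails. A minimal sketch of how the generator behaves on both input shapes (the file names and record contents below are hypothetical, and `Utils` is assumed to be importable from app.py):

```python
import json
import os
import tempfile

# Hypothetical inputs: one file holding a single JSON array, one holding
# line-delimited JSON. Both should yield the same two records.
samples = {
    "array.json": json.dumps([{"text": "alpha"}, {"text": "beta"}]),
    "lines.json": '{"text": "alpha"}\n{"text": "beta"}\n',
}

for name, payload in samples.items():
    path = os.path.join(tempfile.mkdtemp(), name)
    with open(path, "w", encoding="utf-8") as f:
        f.write(payload)
    print(name, "->", [r["text"] for r in Utils._iter_json_records(path)])
    # array.json -> ['alpha', 'beta']
    # lines.json -> ['alpha', 'beta']
```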
```diff
@@ -73,6 +89,10 @@ class Utils:
     def _collect_pairs(root: str) -> pd.DataFrame:
         rows = []
         json_files = glob(os.path.join(root, "**/*.json"), recursive=True)
+        if not json_files:
+            print(f"Warning: No JSON files found in {root}. Please check the path.")
+            return pd.DataFrame(rows)
+
         for jp in json_files:
             base_dir = os.path.dirname(jp)
             for rec in Utils._iter_json_records(jp):
@@ -87,6 +107,9 @@ class Utils:
     def _build_docstore(df: pd.DataFrame) -> pd.DataFrame:
         def _mk_id(row_text):
             return hashlib.sha1(row_text.encode("utf-8")).hexdigest()[:16]
+        # Check if the dataframe is empty before proceeding
+        if 'text' not in df.columns:
+            return pd.DataFrame(columns=['id', 'passage_text', 'title'])  # Return empty docstore
         df['id'] = df['text'].apply(_mk_id)
         return df.rename(columns={'text': 'passage_text'})
 
```
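Passage ids in the docstore are the first 16 hex characters of a SHA-1 over the passage text, so the same text always maps to the same id; that determinism is also what makes the deduplication in the next hunk cheap and safe. A standalone illustration of the scheme, mirroring `_mk_id` above:

```python
import hashlib

def _mk_id(row_text: str) -> str:
    # Same scheme as _build_docstore: truncated SHA-1 of the UTF-8 text.
    return hashlib.sha1(row_text.encode("utf-8")).hexdigest()[:16]

print(_mk_id("hello world"))    # 2aae6c35c94fcfb4, stable across runs
print(_mk_id("hello world!"))   # different text, different id
```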
```diff
@@ -96,15 +119,26 @@ class Utils:
         os.makedirs(os.path.dirname(out_docstore), exist_ok=True)
         df = Utils._collect_pairs(root)
         print(f"Found {len(df)} total passages.")
-
-
-
+
+        if df.empty:
+            print("Warning: No valid data found to process. The docstore will be empty.")
+            doc = Utils._build_docstore(df)
+        else:
+            df.drop_duplicates(subset=['text'], keep='first', inplace=True)
+            print(f"Found {len(df)} unique passages after deduplication.")
+            doc = Utils._build_docstore(df)
+
         doc.to_parquet(out_docstore, index=False)
         print(f"Docstore saved to {out_docstore}.")
         return doc
 
 def build_faiss_index(encoder, docstore, index_path, text_col="passage_text"):
     print(f"Building FAISS index: {os.path.basename(index_path)}")
+    # Check if docstore is empty
+    if docstore.empty:
+        print("Docstore is empty. Skipping FAISS index creation.")
+        return
+
     texts = docstore[text_col].astype(str).tolist()
     if hasattr(encoder, 'encode_numpy'):
         vecs = encoder.encode_numpy(texts)
```
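The hunk cuts off right after the texts are encoded, so the actual index construction is not visible in this commit. A hedged sketch of a typical continuation for a cosine-similarity retriever follows; the flat inner-product index, the L2 normalization, and the helper name `_write_flat_ip_index` are all assumptions, not something this diff confirms:

```python
import faiss
import numpy as np

def _write_flat_ip_index(vecs: np.ndarray, index_path: str) -> None:
    # Assumed continuation of build_faiss_index: L2-normalize so inner
    # product equals cosine similarity, add every vector, persist to disk.
    vecs = np.ascontiguousarray(vecs, dtype="float32")
    faiss.normalize_L2(vecs)
    index = faiss.IndexFlatIP(vecs.shape[1])
    index.add(vecs)
    faiss.write_index(index, index_path)
```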
```diff
@@ -142,3 +176,4 @@ def main():
 
 if __name__ == "__main__":
     main()
+
```