Update app.py
app.py (CHANGED)
```diff
@@ -62,10 +62,26 @@ class Utils:
 
     @staticmethod
     def _iter_json_records(json_path: str) -> Iterable[Dict[str, Any]]:
+        # This more robust version can handle both single multi-line JSON objects
+        # and line-delimited JSON.
         with open(json_path, "r", encoding="utf-8") as f:
-
+            txt = f.read().strip()
+        if not txt: return
+        try:
+            # Try to parse the whole file as a single JSON object (list or dict)
+            obj = json.loads(txt)
+            if isinstance(obj, dict):
+                yield obj
+                return
+            for it in obj if isinstance(obj, list) else []:
+                if isinstance(it, dict): yield it
+            return
+        except json.JSONDecodeError:
+            # If that fails, fall back to parsing line by line
+            for line in txt.splitlines():
+                if not (line := line.strip()): continue
                 try:
-
+                    if isinstance((obj := json.loads(line)), dict): yield obj
                 except json.JSONDecodeError:
                     continue
 
```
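The rewritten reader first tries to parse the entire file as one JSON value and only falls back to line-by-line parsing when that fails. A minimal sketch of how the generator behaves on both input shapes (the file names and record contents below are hypothetical, and `Utils` is assumed to be importable from app.py):

```python
import json
import os
import tempfile

# Hypothetical inputs: one file holding a single JSON array, one holding
# line-delimited JSON. Both should yield the same two records.
samples = {
    "array.json": json.dumps([{"text": "alpha"}, {"text": "beta"}]),
    "lines.json": '{"text": "alpha"}\n{"text": "beta"}\n',
}

for name, payload in samples.items():
    path = os.path.join(tempfile.mkdtemp(), name)
    with open(path, "w", encoding="utf-8") as f:
        f.write(payload)
    print(name, "->", [r["text"] for r in Utils._iter_json_records(path)])
    # array.json -> ['alpha', 'beta']
    # lines.json -> ['alpha', 'beta']
```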
```diff
@@ -73,6 +89,10 @@ class Utils:
     def _collect_pairs(root: str) -> pd.DataFrame:
         rows = []
         json_files = glob(os.path.join(root, "**/*.json"), recursive=True)
+        if not json_files:
+            print(f"Warning: No JSON files found in {root}. Please check the path.")
+            return pd.DataFrame(rows)
+
         for jp in json_files:
             base_dir = os.path.dirname(jp)
             for rec in Utils._iter_json_records(jp):
@@ -87,6 +107,9 @@ class Utils:
     def _build_docstore(df: pd.DataFrame) -> pd.DataFrame:
         def _mk_id(row_text):
             return hashlib.sha1(row_text.encode("utf-8")).hexdigest()[:16]
+        # Check if the dataframe is empty before proceeding
+        if 'text' not in df.columns:
+            return pd.DataFrame(columns=['id', 'passage_text', 'title'])  # Return empty docstore
         df['id'] = df['text'].apply(_mk_id)
         return df.rename(columns={'text': 'passage_text'})
 
```
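Passage ids in the docstore are the first 16 hex characters of a SHA-1 over the passage text, so the same text always maps to the same id; that determinism is also what makes the deduplication in the next hunk cheap and safe. A standalone illustration of the scheme, mirroring `_mk_id` above:

```python
import hashlib

def _mk_id(row_text: str) -> str:
    # Same scheme as _build_docstore: truncated SHA-1 of the UTF-8 text.
    return hashlib.sha1(row_text.encode("utf-8")).hexdigest()[:16]

print(_mk_id("hello world"))    # 2aae6c35c94fcfb4, stable across runs
print(_mk_id("hello world!"))   # different text, different id
```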
```diff
@@ -96,15 +119,26 @@ class Utils:
         os.makedirs(os.path.dirname(out_docstore), exist_ok=True)
         df = Utils._collect_pairs(root)
         print(f"Found {len(df)} total passages.")
-
-
-
+
+        if df.empty:
+            print("Warning: No valid data found to process. The docstore will be empty.")
+            doc = Utils._build_docstore(df)
+        else:
+            df.drop_duplicates(subset=['text'], keep='first', inplace=True)
+            print(f"Found {len(df)} unique passages after deduplication.")
+            doc = Utils._build_docstore(df)
+
         doc.to_parquet(out_docstore, index=False)
         print(f"Docstore saved to {out_docstore}.")
         return doc
 
 def build_faiss_index(encoder, docstore, index_path, text_col="passage_text"):
     print(f"Building FAISS index: {os.path.basename(index_path)}")
+    # Check if docstore is empty
+    if docstore.empty:
+        print("Docstore is empty. Skipping FAISS index creation.")
+        return
+
     texts = docstore[text_col].astype(str).tolist()
     if hasattr(encoder, 'encode_numpy'):
         vecs = encoder.encode_numpy(texts)
```
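The hunk cuts off right after the texts are encoded, so the actual index construction is not visible in this commit. A hedged sketch of a typical continuation for a cosine-similarity retriever follows; the flat inner-product index, the L2 normalization, and the helper name `_write_flat_ip_index` are all assumptions, not something this diff confirms:

```python
import faiss
import numpy as np

def _write_flat_ip_index(vecs: np.ndarray, index_path: str) -> None:
    # Assumed continuation of build_faiss_index: L2-normalize so inner
    # product equals cosine similarity, add every vector, persist to disk.
    vecs = np.ascontiguousarray(vecs, dtype="float32")
    faiss.normalize_L2(vecs)
    index = faiss.IndexFlatIP(vecs.shape[1])
    index.add(vecs)
    faiss.write_index(index, index_path)
```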
```diff
@@ -142,3 +176,4 @@ def main():
 
 if __name__ == "__main__":
     main()
+
```