Spaces:
Runtime error
Runtime error
accidentally removed extract_text_from_json
Browse files
RAG.py
CHANGED
|
@@ -14,6 +14,7 @@ import requests
|
|
| 14 |
from typing import Dict, Any, Optional, List, Tuple
|
| 15 |
import logging
|
| 16 |
import concurrent.futures
|
|
|
|
| 17 |
|
| 18 |
def retrieve(query: str,vectorstore:PineconeVectorStore, k: int = 100) -> Tuple[List[Document], List[float]]:
|
| 19 |
start = time.time()
|
|
@@ -47,6 +48,22 @@ def safe_get_json(url: str) -> Optional[Dict]:
|
|
| 47 |
logging.error(f"Error fetching from {url}: {str(e)}")
|
| 48 |
return None
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
def process_single_document(doc: Document) -> Optional[Document]:
|
| 51 |
"""Process a single document by fetching and extracting metadata."""
|
| 52 |
if not doc.metadata.get('source'):
|
|
@@ -68,7 +85,7 @@ def process_single_document(doc: Document) -> Optional[Document]:
|
|
| 68 |
)
|
| 69 |
return None
|
| 70 |
|
| 71 |
-
def rerank(documents: List[Document], query: str, max_workers: int =
|
| 72 |
"""Ingest more metadata and rerank documents using BM25 with parallel processing."""
|
| 73 |
start = time.time()
|
| 74 |
if not documents:
|
|
@@ -103,6 +120,7 @@ def rerank(documents: List[Document], query: str, max_workers: int = 2) -> List[
|
|
| 103 |
logging.info(f"Finished reranking: {time.time()-start}")
|
| 104 |
return full_docs
|
| 105 |
|
|
|
|
| 106 |
def parse_xml_and_query(query:str,xml_string:str) -> str:
|
| 107 |
"""parse xml and return rephrased query"""
|
| 108 |
if not xml_string:
|
|
|
|
| 14 |
from typing import Dict, Any, Optional, List, Tuple
|
| 15 |
import logging
|
| 16 |
import concurrent.futures
|
| 17 |
+
import json
|
| 18 |
|
| 19 |
def retrieve(query: str,vectorstore:PineconeVectorStore, k: int = 100) -> Tuple[List[Document], List[float]]:
|
| 20 |
start = time.time()
|
|
|
|
| 48 |
logging.error(f"Error fetching from {url}: {str(e)}")
|
| 49 |
return None
|
| 50 |
|
| 51 |
+
def extract_text_from_json(json_data: Dict) -> str:
|
| 52 |
+
"""Extract text content from JSON response."""
|
| 53 |
+
if not json_data:
|
| 54 |
+
return ""
|
| 55 |
+
|
| 56 |
+
text_parts = []
|
| 57 |
+
|
| 58 |
+
# Handle direct text fields
|
| 59 |
+
text_fields = ["title_info_primary_tsi","abstract_tsi","subject_geographic_sim","genre_basic_ssim","genre_specific_ssim","date_tsim"]
|
| 60 |
+
for field in text_fields:
|
| 61 |
+
if field in json_data['data']['attributes'] and json_data['data']['attributes'][field]:
|
| 62 |
+
# print(json_data[field])
|
| 63 |
+
text_parts.append(str(json_data['data']['attributes'][field]))
|
| 64 |
+
|
| 65 |
+
return " ".join(text_parts) if text_parts else "No content available"
|
| 66 |
+
|
| 67 |
def process_single_document(doc: Document) -> Optional[Document]:
|
| 68 |
"""Process a single document by fetching and extracting metadata."""
|
| 69 |
if not doc.metadata.get('source'):
|
|
|
|
| 85 |
)
|
| 86 |
return None
|
| 87 |
|
| 88 |
+
def rerank(documents: List[Document], query: str, max_workers: int = 3) -> List[Document]:
|
| 89 |
"""Ingest more metadata and rerank documents using BM25 with parallel processing."""
|
| 90 |
start = time.time()
|
| 91 |
if not documents:
|
|
|
|
| 120 |
logging.info(f"Finished reranking: {time.time()-start}")
|
| 121 |
return full_docs
|
| 122 |
|
| 123 |
+
|
| 124 |
def parse_xml_and_query(query:str,xml_string:str) -> str:
|
| 125 |
"""parse xml and return rephrased query"""
|
| 126 |
if not xml_string:
|