# Repository path: model111 / larm / data / utils / search_utils.py
# Uploader avatar: LCZZZZ's picture
# Commit message: Upload MemGen code and data
# Commit: e34b94f (verified)
from typing import Optional, Union
from langchain.docstore.document import Document
import wikipedia
class LangChainWiki:
    """Wikipedia-backed explorer offering a ``search`` and a ``lookup`` step.

    ``search`` fetches a Wikipedia page and stores it as the current
    document; ``lookup`` then walks through the paragraphs of that document
    that contain a given term, one match per call.
    """

    def __init__(self) -> None:
        # Page stored by the most recent successful search (None otherwise).
        self.document: Optional[Document] = None
        # Lower-cased term of the latest lookup and the cursor into its matches.
        self.lookup_str = ""
        self.lookup_index = 0

    @staticmethod
    def _not_found_message(term: str) -> str:
        """Build the "could not find" message, listing similar titles if available."""
        try:
            similar = wikipedia.search(term)
        except Exception:
            # Best effort: the suggestion list is only informational, so a
            # failure here must not mask the "not found" result itself.
            similar = []
        return f"Could not find [{term}]. Similar: {similar}"

    def _try_search(self, term: str) -> "Union[str, Document]":
        """Fetch the page for ``term``.

        Returns a ``Document`` holding the page content (with the page URL
        under the ``"page"`` metadata key), or a "could not find" message
        string when the page cannot be fetched.
        """
        try:
            # Fetch the page once and read both fields from the same object.
            page = wikipedia.page(term)
            return Document(page_content=page.content, metadata={"page": page.url})
        except Exception:
            # PageError, DisambiguationError and any other failure are all
            # reported to the caller with the same message.
            return self._not_found_message(term)

    def search(self, search: str) -> str:
        """Search Wikipedia for ``search`` and make the hit the current document.

        If the page cannot be fetched, the first title returned by
        ``wikipedia.search`` is tried once as a fallback.

        Returns:
            The first paragraph of the page on success, otherwise a
            "Could not find [...]" message. On failure ``self.document`` is
            set to ``None``.
        """
        result = self._try_search(search)
        if isinstance(result, str):
            try:
                similar = wikipedia.search(search)
                if similar:
                    fallback = similar[0]
                    print(f"[INFO] Falling back to similar term: {fallback}")
                    result = self._try_search(fallback)
            except Exception as e:
                print(f"[ERROR] Could not fetch similar terms: {e}")

        # The lookup cursor refers to the previous document; start over so a
        # repeated lookup term does not skip matches in the new page.
        self.lookup_str = ""
        self.lookup_index = 0

        if isinstance(result, Document):
            self.document = result
            return self._summary
        self.document = None
        return result

    def lookup(self, term: str) -> str:
        """Return the next paragraph of the current document containing ``term``.

        Matching is case-insensitive. Calling again with the same term
        advances to the next match; a different term restarts at the first.

        Returns:
            ``"(Result i/n) <paragraph>"``, ``"No Results"`` when nothing
            matches, or ``"No More Results"`` once all matches were returned.

        Raises:
            ValueError: If there is no current document.
        """
        if self.document is None:
            raise ValueError("Cannot lookup without a successful search first")

        needle = term.lower()
        if needle != self.lookup_str:
            self.lookup_str = needle
            self.lookup_index = 0
        else:
            self.lookup_index += 1

        lookups = [p for p in self._paragraphs if self.lookup_str in p.lower()]
        if not lookups:
            return "No Results"
        if self.lookup_index >= len(lookups):
            return "No More Results"
        result_prefix = f"(Result {self.lookup_index + 1}/{len(lookups)})"
        return f"{result_prefix} {lookups[self.lookup_index]}"

    @property
    def _summary(self) -> str:
        """First paragraph of the current document."""
        return self._paragraphs[0]

    @property
    def _sumary(self) -> str:
        """Alias of ``_summary`` kept for code using the original misspelled name."""
        return self._summary

    @property
    def _paragraphs(self) -> list[str]:
        """Current document content split on blank lines.

        Raises:
            ValueError: If there is no current document.
        """
        if self.document is None:
            raise ValueError("Cannot get paragraphs without a document")
        return self.document.page_content.split("\n\n")