Spaces:
Runtime error
Runtime error
ffreemt committed on
Commit ·
21c3825
1
Parent(s): e882a86
Add test.epub to docs
Browse files
- .gitignore +3 -0
- app.py +4 -2
- docs/test.epub +0 -0
- epub_loader.py +38 -0
.gitignore
CHANGED
|
@@ -1,3 +1,6 @@
|
|
| 1 |
.venv
|
| 2 |
db
|
| 3 |
dummy
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
.venv
|
| 2 |
db
|
| 3 |
dummy
|
| 4 |
+
.ENV
|
| 5 |
+
.env
|
| 6 |
+
__pycache__
|
app.py
CHANGED
|
@@ -289,12 +289,12 @@ def ingest(
|
|
| 289 |
]
|
| 290 |
|
| 291 |
|
|
|
|
| 292 |
# https://huggingface.co/TheBloke/vicuna-7B-1.1-HF
|
| 293 |
def gen_local_llm(model_id="TheBloke/vicuna-7B-1.1-HF"):
|
| 294 |
"""Gen a local llm.
|
| 295 |
|
| 296 |
localgpt run_localgpt
|
| 297 |
-
|
| 298 |
https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2
|
| 299 |
with torch.device(“cuda”):
|
| 300 |
model = AutoModelForCausalLM.from_pretrained(“gpt2-large”, torch_dtype=torch.float16)
|
|
@@ -354,7 +354,9 @@ def load_qa(device=None, model_name: str = "hkunlp/instructor-base"):
|
|
| 354 |
llm = gen_local_llm() # "TheBloke/vicuna-7B-1.1-HF" 12G?
|
| 355 |
|
| 356 |
qa = RetrievalQA.from_chain_type(
|
| 357 |
-
llm=llm, chain_type="stuff",
|
|
|
|
|
|
|
| 358 |
)
|
| 359 |
|
| 360 |
logger.info("Done qa")
|
|
|
|
| 289 |
]
|
| 290 |
|
| 291 |
|
| 292 |
+
# TheBloke/Wizard-Vicuna-7B-Uncensored-HF
|
| 293 |
# https://huggingface.co/TheBloke/vicuna-7B-1.1-HF
|
| 294 |
def gen_local_llm(model_id="TheBloke/vicuna-7B-1.1-HF"):
|
| 295 |
"""Gen a local llm.
|
| 296 |
|
| 297 |
localgpt run_localgpt
|
|
|
|
| 298 |
https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2
|
| 299 |
with torch.device(“cuda”):
|
| 300 |
model = AutoModelForCausalLM.from_pretrained(“gpt2-large”, torch_dtype=torch.float16)
|
|
|
|
| 354 |
llm = gen_local_llm() # "TheBloke/vicuna-7B-1.1-HF" 12G?
|
| 355 |
|
| 356 |
qa = RetrievalQA.from_chain_type(
|
| 357 |
+
llm=llm, chain_type="stuff",
|
| 358 |
+
retriever=retriever,
|
| 359 |
+
return_source_documents=True,
|
| 360 |
)
|
| 361 |
|
| 362 |
logger.info("Done qa")
|
docs/test.epub
ADDED
|
Binary file (261 kB). View file
|
|
|
epub_loader.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Loads an epub file into a list of documents."""
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import List, Union
|
| 5 |
+
|
| 6 |
+
from epub2txt import epub2txt
|
| 7 |
+
from langchain.docstore.document import Document
|
| 8 |
+
from langchain.document_loaders.base import BaseLoader
|
| 9 |
+
from loguru import logger
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@dataclass
class EpubLoader(BaseLoader):
    """Load an epub file into a list of documents, one per chapter.

    Args:
        file_path: file path or url to epub

    Returns:
        self.load() -> list of Documents
    """

    file_path: Union[str, Path]

    def load(self) -> List[Document]:
        """Load data into document objects.

        Returns:
            One Document per (chapter title, chapter text) pair produced by
            ``epub2txt``. Each document's metadata stores the source path
            under ``"source"`` and the chapter title under ``"ch."``.

        Raises:
            Exception: any error raised by ``epub2txt`` is logged and then
                re-raised unchanged.
        """
        # Normalise once: ``file_path`` may be a ``Path``, but metadata
        # should carry a plain string so it stays serialisable downstream.
        source = str(self.file_path)

        try:
            texts = epub2txt(source, outputlist=True)
            # Chapter titles of the conversion just performed are read from
            # an attribute on the ``epub2txt`` function itself.
            ch_titles = list(epub2txt.content_titles)
        except Exception as exc:
            logger.error(exc)
            raise

        # ``zip`` stops at the shorter sequence; surface a mismatch instead
        # of silently dropping chapters.
        if len(ch_titles) != len(texts):
            logger.warning(
                "epub2txt returned {} titles but {} texts for {}; "
                "extra entries are ignored",
                len(ch_titles),
                len(texts),
                source,
            )

        return [
            Document(page_content=text, metadata={"source": source, "ch.": title})
            for title, text in zip(ch_titles, texts)
        ]
|