Spaces:

iohanngrig
/

textSummary

Build error

App Files Files Community

iohanngrig committed on Jan 27, 2024

Commit

f3175f2

verified ·

1 Parent(s): 1a3e29b

Delete pages/utils

Browse files

Files changed (1) hide show

pages/utils/process_data.py +0 -72

pages/utils/process_data.py DELETED Viewed

@@ -1,72 +0,0 @@
-from PyPDF2 import PdfReader
-from langchain.text_splitter import CharacterTextSplitter
-from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain_community.vectorstores import FAISS
# Default chunk length: used as the default `chunk_size` of
# split_text_into_chunks and as the splitter chunk size in process_text.
CHUNK_SIZE = 1024
# Default `max_chunks` of generate_chunks, where it caps the number of
# words packed into a single chunk.
MAX_CHUNKS = 500
def split_text_into_chunks(text, chunk_size=CHUNK_SIZE):
    """
    Splits text into consecutive, non-overlapping fixed-size chunks.
    Args:
        text (str): Text to be split.
        chunk_size (int, optional): Maximum number of characters per chunk.
            Defaults to CHUNK_SIZE.
    Returns:
        list[str]: Chunks in their original order. Every chunk holds exactly
            chunk_size characters except possibly the last one; an empty
            text yields an empty list.
    Raises:
        ValueError: If chunk_size is not positive.
    """
    # A zero step makes range() fail with an unrelated-looking message and a
    # negative step silently produces no chunks at all, so reject both here.
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]
def generate_chunks(inp_str, max_chunks=MAX_CHUNKS):
    """
    Groups the sentences of a text into chunks of bounded word count.
    The text is cut right after every '.', '?' or '!'. Consecutive sentences
    are then packed greedily into the current chunk until adding the next
    sentence would push it past the word limit, at which point a new chunk
    is started.
    Args:
        inp_str (str): Text to be chunked.
        max_chunks (int, optional): Despite its name, the maximum number of
            words per chunk (not the number of chunks). Defaults to
            MAX_CHUNKS. A single sentence longer than this limit is kept
            whole in its own chunk.
    Returns:
        list[str]: Chunks whose words are joined by single spaces. Input
            that contains no words yields an empty list.
    """
    # Function-scope import keeps this block self-contained.
    import re

    chunks = []
    # Splitting on the zero-width position after a terminator keeps the
    # punctuation attached to its sentence without injecting a sentinel
    # string that could collide with the input text.
    for sentence in re.split(r'(?<=[.?!])', inp_str):
        # Drop the empty tokens produced by leading/repeated spaces so they
        # neither count toward the limit nor turn into double spaces.
        words = [word for word in sentence.split(' ') if word]
        if not words:
            continue
        if chunks and len(chunks[-1]) + len(words) <= max_chunks:
            chunks[-1].extend(words)
        else:
            chunks.append(words)
    return [' '.join(chunk) for chunk in chunks]
def pdf_to_text(pdf_path):
    """
    Extracts the text of every page of a PDF as one single-line string.
    Args:
        pdf_path (str): Path to the PDF file; handed to PdfReader as is.
    Returns:
        str: Page texts joined by a single space, with every newline
            replaced by a space.
    """
    pdf = PdfReader(pdf_path)
    page_texts = []
    for page in pdf.pages:
        page_texts.append(page.extract_text())
    joined = " ".join(page_texts)
    # Flatten to one line: no newline survives in the returned text.
    return joined.replace("\n", " ")
def process_text(text):
    """
    Builds a FAISS knowledge base from a text.
    The text is split with Langchain's CharacterTextSplitter (newline
    separator, CHUNK_SIZE chunk size, overlap of 200, length measured with
    len), the chunks are embedded with the
    'sentence-transformers/all-MiniLM-L6-v2' HuggingFace model and the
    result is indexed with FAISS.
    Args:
        text (str): Text to index.
    Returns:
        The vector store returned by FAISS.from_texts.
    """
    # NOTE(review): pdf_to_text in this module replaces every "\n" with a
    # space, so text coming from it has no separator left for this splitter
    # to cut on -- confirm what callers actually pass in.
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=CHUNK_SIZE,
        chunk_overlap=200,
        length_function=len,
    )
    pieces = splitter.split_text(text)
    # Embed the pieces and index them in a single step.
    embedder = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return FAISS.from_texts(pieces, embedder)