yiqing111 commited on
Commit
8255e91
·
verified ·
1 Parent(s): b061aa3

Upload 7 files

Browse files
script/chunk.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ from typing import List, Dict
4
+ from tqdm import tqdm
5
+
6
class SimpleTextChunker:
    """Split extracted documents into overlapping text chunks for embedding.

    Two strategies:
      * simple (default): paragraph-aware chunking with character overlap,
        falling back to a fixed-size sliding window when the text has no
        usable paragraph structure.
      * recursive: split on Markdown headers first (top level only), then
        chunk each section with the simple strategy.
    """

    def __init__(self,
                 chunk_size: int = 200,
                 chunk_overlap: int = 20,
                 recursive: bool = False,
                 max_recursion_depth: int = 3):
        # chunk_size and chunk_overlap are measured in characters.
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.recursive = recursive
        self.max_recursion_depth = max_recursion_depth

    def is_mainly_chinese(self, text: str) -> bool:
        """Return True when more than half of the characters are CJK ideographs."""
        if not text:
            return False

        chinese_chars = sum(1 for char in text if '\u4e00' <= char <= '\u9fff')
        return chinese_chars / len(text) > 0.5

    def _make_chunk(self, content: str, source: str, index: int) -> Dict:
        # One chunk record, shared by both chunking strategies.
        return {
            "source": source,
            "content": content,
            "chunk_index": index,
            "is_chinese": self.is_mainly_chinese(content)
        }

    def simple_chunk_with_overlap(self, text: str, source: str) -> List[Dict]:
        """Chunk *text*, preferring paragraph boundaries when they exist.

        Returns a list of dicts with keys source/content/chunk_index/is_chinese.
        """
        chunks = []

        # Check if we should try to split on paragraph boundaries
        paragraphs = []
        if '\n\n' in text:
            paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]

        # If we have meaningful paragraphs (more than one, and none longer
        # than a whole chunk), use them as base units.
        if paragraphs and len(paragraphs) > 1 and max(len(p) for p in paragraphs) < self.chunk_size:
            current_chunk = []
            current_size = 0

            for para in paragraphs:
                para_size = len(para)

                # Flush the current chunk when adding this paragraph would
                # overflow it (and we already have content).
                if current_size + para_size > self.chunk_size and current_chunk:
                    chunk_text = '\n\n'.join(current_chunk)
                    chunks.append(self._make_chunk(chunk_text, source, len(chunks)))

                    # Keep trailing paragraphs whose combined length fits the
                    # overlap budget; they seed the next chunk.
                    overlap_size = 0
                    overlap_paras = []
                    for p in reversed(current_chunk):
                        if overlap_size + len(p) <= self.chunk_overlap:
                            overlap_paras.insert(0, p)
                            overlap_size += len(p)
                        else:
                            break

                    current_chunk = overlap_paras
                    current_size = overlap_size

                current_chunk.append(para)
                current_size += para_size

            # Emit whatever is left after the last paragraph.
            if current_chunk:
                chunk_text = '\n\n'.join(current_chunk)
                chunks.append(self._make_chunk(chunk_text, source, len(chunks)))
        else:
            # Character-based fallback. Clamp the step to >= 1 so a
            # configuration with chunk_overlap >= chunk_size cannot make
            # range() raise ValueError (step 0) or go backwards.
            step = max(1, self.chunk_size - self.chunk_overlap)
            for i in range(0, len(text), step):
                chunk_end = min(i + self.chunk_size, len(text))
                if chunk_end <= i:
                    break
                chunks.append(self._make_chunk(text[i:chunk_end], source, len(chunks)))

        return chunks

    def recursive_chunk(self, text: str, source: str, depth: int = 0) -> List[Dict]:
        """Chunk by Markdown sections at the top level, recursing into each section."""
        # Small enough (or recursion budget exhausted): emit a single chunk.
        if len(text) <= self.chunk_size or depth >= self.max_recursion_depth:
            return [{
                "source": source,
                "content": text,
                "chunk_index": 0,
                "recursion_depth": depth,
                "is_chinese": self.is_mainly_chinese(text)
            }]

        # Top level only: split on Markdown headers ("# ", "## ", ...).
        if depth == 0 and '\n#' in text:
            sections = re.split(r'\n(#+ )', text)
            if len(sections) > 1:
                combined_sections = []

                # Keep any preamble before the first header; the previous
                # version dropped sections[0] and silently lost that text.
                if sections[0].strip():
                    combined_sections.append(sections[0].strip())

                # re.split with a capture group alternates [marker, body];
                # re-attach each header marker to its section body.
                for i in range(1, len(sections), 2):
                    if i + 1 < len(sections):
                        combined_sections.append(sections[i] + sections[i + 1])
                    else:
                        combined_sections.append(sections[i])

                # Recursively process each section.
                all_chunks = []
                for i, section in enumerate(combined_sections):
                    section_chunks = self.recursive_chunk(section, source, depth + 1)

                    # Re-number chunks globally and tag their section.
                    for j, chunk in enumerate(section_chunks):
                        chunk["chunk_index"] = len(all_chunks) + j
                        chunk["section_index"] = i

                    all_chunks.extend(section_chunks)

                return all_chunks

        # No natural sections, or not at the top level: overlap chunking.
        return self.simple_chunk_with_overlap(text, source)

    def process_document(self, document: Dict) -> List[Dict]:
        """Chunk one extracted-document dict (as produced by PDFTextExtractor).

        Returns [] when the document has no text or extraction failed.
        """
        if not document.get("text") or not document.get("success", False):
            print(f"Skipping document {document.get('filename', 'unknown')}: No text or extraction failed")
            return []

        text = document["text"]
        source = document.get("filename", "unknown")

        if self.recursive:
            chunks = self.recursive_chunk(text, source)
        else:
            chunks = self.simple_chunk_with_overlap(text, source)

        # Add document metadata to each chunk
        for chunk in chunks:
            chunk["document_pages"] = document.get("pages", 0)
            chunk["total_chunks"] = len(chunks)

        return chunks

    def process_documents(self, documents: List[Dict]) -> List[Dict]:
        """Chunk a batch of documents, concatenating all their chunks."""
        all_chunks = []

        for doc in tqdm(documents, desc="Chunking documents"):
            all_chunks.extend(self.process_document(doc))

        print(f"Created {len(all_chunks)} chunks from {len(documents)} documents")
        return all_chunks

    def save_chunks(self, chunks: List[Dict], output_path: str):
        """Write a human-readable dump of *chunks* to *output_path*."""
        with open(output_path, 'w', encoding='utf-8') as f:
            for i, chunk in enumerate(chunks):
                f.write(f"Chunk {i+1}/{len(chunks)}\n")
                f.write(f"Source: {chunk['source']}\n")
                f.write(f"Index: {chunk['chunk_index']}/{chunk['total_chunks']}\n")
                if "recursion_depth" in chunk:
                    f.write(f"Depth: {chunk['recursion_depth']}\n")
                f.write(f"Chinese: {chunk.get('is_chinese', False)}\n")
                f.write("Content:\n")
                f.write(chunk['content'])
                f.write("\n" + "-" * 80 + "\n\n")

        print(f"Saved {len(chunks)} chunks to {output_path}")
script/embedding.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
from sentence_transformers import SentenceTransformer

# Shared multilingual embedding model, loaded once at import time.
embedding_model = SentenceTransformer('intfloat/multilingual-e5-large')


def get_embedding(text):
    """Embed *text* with the shared model and return a plain Python list."""
    vector = embedding_model.encode(text)
    return vector.tolist()
script/llm.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from openai import OpenAI
import os
from dotenv import load_dotenv
# Read DEEPSEEK_API_KEY from a local .env file into the environment.
load_dotenv()

# DeepSeek exposes an OpenAI-compatible endpoint, so the stock OpenAI
# client works once pointed at DeepSeek's base URL.
client = OpenAI(api_key=os.getenv("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com")
7
+
8
def ask_llm(question, context):
    """Answer *question* grounded in *context* via the DeepSeek chat API.

    Args:
        question: The user's question (plain string).
        context: Retrieved note text to ground the answer in.

    Returns:
        The assistant's reply text.
    """
    # The original built a `prompt` string here that was never used — the
    # request below is what is actually sent, so the dead code is removed.
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": "You are a helpful assistant who answers based on the given notes."},
            {"role": "user", "content": f"Notes:\n{context}\n\nQuestion: {question}"}
        ]
    )

    return response.choices[0].message.content
27
+
script/parse.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import glob
3
+ from typing import List, Dict
4
+ import fitz
5
+ import re
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ from tqdm import tqdm
8
+
9
class PDFTextExtractor:
    """Extract and clean plain text from every PDF in a directory (PyMuPDF)."""

    def __init__(self, input_dir: str, output_dir: str = None):
        self.input_dir = input_dir
        self.output_dir = output_dir or os.path.join(input_dir, "extracted_text")

        # Ensure output directory exists
        os.makedirs(self.output_dir, exist_ok=True)

    def get_pdf_files(self) -> List[str]:
        """Return the paths of all *.pdf / *.PDF files in the input directory."""
        pdf_files = glob.glob(os.path.join(self.input_dir, "*.pdf"))
        pdf_files.extend(glob.glob(os.path.join(self.input_dir, "*.PDF")))

        print(f"Found {len(pdf_files)} PDF files in directory {self.input_dir}")
        return pdf_files

    def extract_text_from_pdf(self, pdf_path: str) -> Dict:
        """Extract and clean the text of one PDF.

        Returns a result dict with keys:
            filename, path, success, text, pages, error.
        On failure, success is False and error holds the message.
        """
        filename = os.path.basename(pdf_path)
        result = {
            "filename": filename,
            "path": pdf_path,
            "success": False,
            "text": "",
            "pages": 0,
            "error": None
        }

        try:
            doc = fitz.open(pdf_path)
            try:
                result["pages"] = len(doc)

                full_text = ""
                for page_num in range(len(doc)):
                    page = doc.load_page(page_num)
                    # "text" mode extracts plain text, ignoring tables and images
                    page_text = page.get_text("text")
                    full_text += page_text + "\n\n"  # blank line separates pages

                result["text"] = self.clean_text(full_text)
                result["success"] = True
            finally:
                # Close even when a page fails mid-extraction; previously the
                # document handle leaked on any exception.
                doc.close()

        except Exception as e:
            # Report the actual filename (was hard-coded "(unknown)").
            error_msg = f"Error extracting {filename}: {str(e)}"
            print(error_msg)
            result["error"] = error_msg

        return result

    def clean_text(self, text: str) -> str:
        """Normalize extracted text while preserving paragraph breaks."""
        # Remove unprintable characters, but keep Chinese, English, numbers,
        # basic punctuation and whitespace
        text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9.,!?;:()\'",。!?、;:《》【】「」\s]', '', text)

        # Collapse runs of spaces/tabs only. Newlines are kept so the "\n\n"
        # paragraph boundaries survive for the downstream chunker — the old
        # r'\s+' -> ' ' pass flattened every newline and broke paragraph mode.
        text = re.sub(r'[ \t]+', ' ', text)
        text = re.sub(r' *\n *', '\n', text)

        # Remove consecutive empty lines
        text = re.sub(r'\n{3,}', '\n\n', text)

        # Fix spacing issues between Chinese and English
        text = re.sub(r'([a-zA-Z])([\u4e00-\u9fa5])', r'\1 \2', text)
        text = re.sub(r'([\u4e00-\u9fa5])([a-zA-Z])', r'\1 \2', text)

        return text.strip()

    def save_extracted_text(self, extraction_result: Dict) -> None:
        """Save the extracted text to a .txt file next to the PDF's base name."""
        if not extraction_result["success"]:
            return

        # Create output filename based on original filename
        base_name = os.path.splitext(extraction_result["filename"])[0]
        output_path = os.path.join(self.output_dir, f"{base_name}.txt")

        # Write to text file
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(extraction_result["text"])

        print(f"Saved extracted text to {output_path}")

    def process_single_pdf(self, pdf_path: str) -> Dict:
        """Extract one PDF, persist its text on success, and return the result dict."""
        extraction_result = self.extract_text_from_pdf(pdf_path)

        if extraction_result["success"]:
            self.save_extracted_text(extraction_result)
            print(f"Successfully processed {extraction_result['filename']} ({extraction_result['pages']} pages)")
        else:
            print(f"Failed to process {extraction_result['filename']}: {extraction_result['error']}")

        return extraction_result

    def extract_all_pdfs(self, max_workers: int = 4) -> List[Dict]:
        """Extract every PDF in the input directory in parallel threads."""
        pdf_files = self.get_pdf_files()
        results = []

        if not pdf_files:
            print("No PDF files found")
            return results

        # Thread pool: extraction is I/O-heavy, so threads overlap the waits.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # tqdm provides the progress bar over the mapped results
            for result in tqdm(executor.map(self.process_single_pdf, pdf_files),
                               total=len(pdf_files),
                               desc="Processing PDF files"):
                results.append(result)

        # Count successful and failed processes
        success_count = sum(1 for r in results if r["success"])
        fail_count = len(results) - success_count

        print(f"PDF processing completed: {success_count} successful, {fail_count} failed")

        return results
129
+
130
# Usage example
if __name__ == "__main__":
    # Input and output both point at the shared data directory.
    INPUT_DIR = "../data"
    OUTPUT_DIR = "../data"

    extractor = PDFTextExtractor(INPUT_DIR, OUTPUT_DIR)

    # Run the extraction with 4 worker threads.
    results = extractor.extract_all_pdfs(max_workers=4)

    # Summarize the run.
    ok = sum(1 for r in results if r['success'])
    print(f"\nProcessed {len(results)} PDF files in total")
    print(f"Successful: {ok}")
    print(f"Failed: {len(results) - ok}")
script/pipeline.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from embedding import get_embedding
2
+ from vector import VectorStore
3
+ from chunk import SimpleTextChunker
4
+ from parse import PDFTextExtractor
5
+
6
def build_knowledge_base(pdf_folder):
    """Parse every PDF in *pdf_folder*, chunk and embed the text, and load
    the vectors into a fresh VectorStore.

    Returns the populated VectorStore.
    """
    # 1. Extract raw text from each PDF in the folder.
    documents = PDFTextExtractor(pdf_folder).extract_all_pdfs()

    # 2. Split the documents into overlapping chunks.
    all_chunks = SimpleTextChunker().process_documents(documents)

    # 3. Embed each chunk and upsert everything into the vector store.
    store = VectorStore()
    embeddings = []
    for chunk in all_chunks:
        embeddings.append(get_embedding(chunk["content"]))

    store.add(embeddings, all_chunks)

    print(f"✅ Knowledge base built with {len(all_chunks)} chunks.")
    return store
script/streamlit_app.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import streamlit as st
from dotenv import load_dotenv

from embedding import get_embedding
from vector import VectorStore
from parse import PDFTextExtractor
from chunk import SimpleTextChunker
from llm import ask_llm

# Load environment variables (API keys are read by the imported modules)
load_dotenv()

# Initialize VectorStore once per session; st.session_state survives reruns
if "store" not in st.session_state:
    st.session_state["store"] = VectorStore()


st.title("📚 RAG Note Assistant - Upload & Ask")

# Uploads are persisted to disk so PDFTextExtractor can read them by path
PDF_FOLDER = "pdf_folder"
os.makedirs(PDF_FOLDER, exist_ok=True)

# upload PDF files
uploaded_files = st.file_uploader("Upload new PDF documents", accept_multiple_files=True, type=["pdf"])

if uploaded_files:
    for file in uploaded_files:
        # Save the upload to disk first; extraction works on file paths
        file_path = os.path.join(PDF_FOLDER, file.name)
        with open(file_path, "wb") as f:
            f.write(file.getbuffer())

        # Extract text from the uploaded PDF
        # NOTE(review): a new extractor is created per file; one instance
        # outside the loop would do the same job.
        extractor = PDFTextExtractor(PDF_FOLDER)
        document = extractor.extract_text_from_pdf(file_path)


        # Chunk the extracted text
        chunker = SimpleTextChunker(chunk_size=500, chunk_overlap=100)
        chunks = chunker.process_document(document)

        # Generate embeddings and upsert into Pinecone
        # NOTE(review): Streamlit reruns this script on every interaction, so
        # still-selected files may be re-embedded and re-upserted; consider
        # tracking processed filenames in session_state to avoid that.
        embeddings = [get_embedding(chunk["content"]) for chunk in chunks]
        st.session_state["store"].add(embeddings, chunks)

        st.success(f" '{file.name}' has been successfully added to the knowledge base!")

# ask question
question = st.text_input("Enter your question")

if st.button("Submit"):
    if not question.strip():
        st.warning(" Please enter a valid question.")
    else:
        # Generate query embedding
        query_embedding = get_embedding(question)

        # Perform similarity search
        relevant_chunks = st.session_state["store"].search(query_embedding)

        if not relevant_chunks:
            st.warning(" No relevant content found in the knowledge base. Please upload related documents first.")
        else:
            # Combine retrieved chunks into context
            # ("text" is the key produced by VectorStore.search)
            context = "\n".join([chunk["text"] for chunk in relevant_chunks])

            # Ask the LLM for the answer
            with st.spinner('AI is thinking...'):
                answer = ask_llm(question, context)

            st.markdown("### 🤖 AI Answer")
            st.write(answer)

            st.markdown("### 📖 Reference Chunks")
            st.write(context)
script/vector.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pinecone import Pinecone, ServerlessSpec
3
+ from dotenv import load_dotenv
4
+ import numpy as np
5
+
6
+ load_dotenv()
7
+
8
class VectorStore:
    """Thin wrapper around a Pinecone serverless index for chunk embeddings."""

    def __init__(self):
        # Credentials and index name come from the environment (.env).
        api_key = os.getenv("PINECONE_API_KEY")
        index_name = os.getenv("PINECONE_INDEX_NAME")

        # connect to Pinecone
        self.pc = Pinecone(api_key=api_key)
        if index_name not in self.pc.list_indexes().names():
            # dimension=1024 matches intfloat/multilingual-e5-large embeddings
            self.pc.create_index(
                name=index_name,
                dimension=1024,
                metric="cosine",
                spec=ServerlessSpec(
                    cloud='aws',
                    region='us-east-1'
                )
            )
            print(f" Created new Pinecone index: {index_name}")
        else:
            print(f"Reusing existing Pinecone index: {index_name}")

        self.index = self.pc.Index(index_name)

    def add(self, embeddings, chunks):
        """Upsert chunk embeddings with their text metadata.

        Vector ids are derived from (source, chunk_index) so chunks from
        different uploads cannot collide: the previous "chunk-{idx}" scheme
        restarted at 0 on every call and silently overwrote the vectors of
        earlier documents.
        """
        vectors = []
        for idx, emb in enumerate(embeddings):
            chunk = chunks[idx]
            vector_id = f"{chunk['source']}-{chunk['chunk_index']}"
            vectors.append((
                vector_id,
                emb,
                {"text": chunk["content"], "source": chunk["source"], "position": chunk["chunk_index"]}
            ))
        self.index.upsert(vectors)

    def search(self, query_embedding, top_k=5):
        """Return the top_k most similar chunks as plain dicts.

        Each dict carries text, source, position and the similarity score.
        """
        # (removed dead no-op `query_embedding = query_embedding`)
        results = self.index.query(vector=query_embedding, top_k=top_k, include_metadata=True)
        return [
            {
                "text": item["metadata"]["text"],
                "source": item["metadata"]["source"],
                "position": item["metadata"]["position"],
                "score": item["score"]
            }
            for item in results["matches"]
        ]