Spaces:

random2222
/

trry

Sleeping

App Files Community

random2222 committed on Apr 11, 2025

Commit

62390c0

verified ·

1 Parent(s): d168db4

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -68

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
 import os
 import gradio as gr
 from langchain_community.vectorstores import FAISS
@@ -6,86 +5,55 @@ from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.document_loaders import PyMuPDFLoader
 from langchain.text_splitter import CharacterTextSplitter
 from langchain.chains import RetrievalQA
-from langchain_community.llms import HuggingFaceHub  # Updated import path
-import zipfile
-# Rest of your existing code remains the same...
-# Extract PDFs from zip file
-def extract_pdfs_from_zip(zip_path="data.zip", extract_to="data"):
-    if not os.path.exists(zip_path):
-        raise FileNotFoundError(f"Zip file '{zip_path}' not found.")
-    if not os.path.exists(extract_to):
-        os.makedirs(extract_to)
-    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-        zip_ref.extractall(extract_to)
-def load_pdfs(directory="data"):
-    if not os.path.exists(directory):
-        raise FileNotFoundError(f"The directory '{directory}' does not exist.")
-    raw_documents = []
-    for filename in os.listdir(directory):
-        if filename.endswith(".pdf"):
-            loader = PyMuPDFLoader(os.path.join(directory, filename))
-            docs = loader.load()
-            raw_documents.extend(docs)
-    return raw_documents
-def split_documents(documents):
-    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-    return text_splitter.split_documents(documents)
-def initialize_qa_system():
-    print("📦 Extracting PDFs from zip...")
-    extract_pdfs_from_zip()
-    print("🔄 Loading PDFs...")
-    raw_docs = load_pdfs()
-    print(f"✅ Loaded {len(raw_docs)} raw documents.")
-    if len(raw_docs) == 0:
-        raise ValueError("No PDF documents found in the 'data' directory.")
-    print("🪓 Splitting documents into chunks...")
-    docs = split_documents(raw_docs)
-    print(f"✅ Split into {len(docs)} chunks.")
-    print("🧠 Generating embeddings...")
     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-    print("📦 Creating FAISS vector store...")
-    db = FAISS.from_documents(docs, embeddings)
-    print("✅ Vector store created successfully!")
-    print("🤖 Initializing LLM...")
     llm = HuggingFaceHub(
-        repo_id="google/flan-t5-xxl",
-        model_kwargs={"temperature": 0.5, "max_length": 512}
     )
-    qa = RetrievalQA.from_chain_type(
         llm=llm,
         chain_type="stuff",
-        retriever=db.as_retriever(search_kwargs={"k": 3})
     )
-    return qa
-# Initialize the QA system
-qa_system = initialize_qa_system()
-def chat_response(message, history):
-    response = qa_system({"query": message})
     return response["result"]
-# Create Gradio interface
-demo = gr.ChatInterface(
-    fn=chat_response,
-    title="PDF Knowledge Chatbot",
-    description="Ask questions about the content in your PDF documents"
-)
-if __name__ == "__main__":
-    demo.launch()

 import os
 import gradio as gr
 from langchain_community.vectorstores import FAISS
 from langchain_community.document_loaders import PyMuPDFLoader
 from langchain.text_splitter import CharacterTextSplitter
 from langchain.chains import RetrievalQA
+from langchain_community.llms import HuggingFaceHub
+from huggingface_hub import login
+# 1. Authentication (MUST HAVE)
+login(token=os.environ.get('HF_TOKEN'))
+# 2. PDF Processing Function
+def create_qa_system():
+    # File check
+    if not os.path.exists("data.pdf"):
+        raise gr.Error("❌ data.pdf not found! Upload it in Space's Files tab")
+    # Load PDF
+    loader = PyMuPDFLoader("data.pdf")
+    documents = loader.load()
+    # Split text
+    text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+    texts = text_splitter.split_documents(documents)
+    # Create embeddings
     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    # Build vector store
+    db = FAISS.from_documents(texts, embeddings)
+    # Initialize LLM (Free-tier compatible)
     llm = HuggingFaceHub(
+        repo_id="google/flan-t5-base",  # Changed to smaller model
+        model_kwargs={"temperature": 0.2, "max_length": 256}
     )
+    return RetrievalQA.from_chain_type(
         llm=llm,
         chain_type="stuff",
+        retriever=db.as_retriever(search_kwargs={"k": 2})
     )
+# 3. Initialize system
+qa = create_qa_system()
+# 4. Chat interface
+def chat(message, history):
+    response = qa({"query": message})
     return response["result"]
+# 5. Launch Gradio
+gr.ChatInterface(
+    chat,
+    title="PDF Chatbot",
+    description="Upload your PDF in Files tab ➡️ Ask questions!",
+).launch()