Spaces:

Prachir-AI
/

researchparrot

Sleeping

App Files Files Community

findthehead committed 22 days ago

Commit

532ca99

0 Parent(s):

Fresh start without PDFs

Browse files

Files changed (4) hide show

.gitattributes +35 -0
README.md +48 -0
app.py +220 -0
requirements.txt +8 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,48 @@

+---
+title: Research Parrot
+emoji: 🦜
+colorFrom: yellow
+colorTo: purple
+sdk: gradio
+sdk_version: 5.42.0
+app_file: app.py
+pinned: false
+hf_oauth: true
+hf_oauth_scopes:
+- inference-api
+---
+# 🦜 Research Parrot
+An AI-powered research paper assistant for security researchers. Ask questions about security research papers and get in-depth technical analysis.
+## Features
+- **RAG-based Q&A**: Query your research papers using semantic search powered by Pinecone
+- **Security-focused**: Tailored responses for security researchers with technical depth
+- **LaTeX Support**: Properly renders mathematical formulas and equations
+- **HuggingFace Inference**: Uses open-source LLMs via HuggingFace Inference API
+## Tech Stack
+- [Gradio](https://gradio.app) - Web interface
+- [HuggingFace Hub](https://huggingface.co/docs/huggingface_hub) - LLM inference
+- [LangChain](https://langchain.com) - RAG framework
+- [Pinecone](https://pinecone.io) - Vector database
+## Configuration
+Set these secrets in your Hugging Face Space settings:
+| Secret | Description |
+|--------|-------------|
+| `HF_TOKEN` | Your Hugging Face API token |
+| `PINECONE_API_KEY` | Your Pinecone API key |
+## Usage
+Upload your PDFs in the **Upload Papers** tab and click **Process & Index Papers**, then type a question about security research topics, such as:
+- "What is prompt injection?"
+- "Tell me about jailbreaking techniques"
+- "Explain RAG architecture"
+- "What are the main attack vectors discussed?"

app.py ADDED Viewed

	@@ -0,0 +1,220 @@

+import os
+import tempfile
+import gradio as gr
+from huggingface_hub import InferenceClient
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_pinecone import PineconeVectorStore
+from pinecone import Pinecone
+# For local development, uncomment the following:
+# from dotenv import load_dotenv
+# load_dotenv()
+# Default model - can be changed to any HF model
+DEFAULT_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
+class ResearchParrot:
+    def __init__(self, model_id: str = DEFAULT_MODEL):
+        self.model_id = model_id
+        self.client = InferenceClient(token=os.getenv("HF_TOKEN"))
+        self._vectorstore = None
+    def embeddings(self):
+        return HuggingFaceInferenceAPIEmbeddings(
+            api_key=os.getenv("HF_TOKEN"),
+            model_name="sentence-transformers/all-MiniLM-L6-v2"
+        )
+    def load_docs_from_files(self, file_paths: list):
+        """Load documents from uploaded PDF files"""
+        docs = []
+        for filepath in file_paths:
+            if filepath and filepath.endswith('.pdf'):
+                loader = PyPDFLoader(filepath)
+                docs.extend(loader.load())
+        return docs
+    def split_docs(self, docs):
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000, chunk_overlap=200, add_start_index=True
+        )
+        return text_splitter.split_documents(docs)
+    def vectorstore(self):
+        if self._vectorstore is None:
+            pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
+            index = pc.Index("parrot")
+            self._vectorstore = PineconeVectorStore(embedding=self.embeddings(), index=index)
+        return self._vectorstore
+    def ingest(self, file_paths: list):
+        """Ingest uploaded PDF files into the vector store"""
+        docs = self.load_docs_from_files(file_paths)
+        if not docs:
+            return "No valid PDF documents found to ingest."
+        split_docs = self.split_docs(docs)
+        store = self.vectorstore()
+        ids = store.add_documents(documents=split_docs)
+        return f"Successfully ingested {len(ids)} document chunks from {len(file_paths)} PDF(s)."
+    def query(self, question: str):
+        if not question.strip():
+            return "Please enter a question."
+        store = self.vectorstore()
+        docs = store.similarity_search(question, k=5)
+        if not docs:
+            return "No relevant documents found. Please upload and ingest some PDFs first."
+        context = "\n\n".join([doc.page_content for doc in docs])
+        prompt = f"""You are a research assistant. Answer the question ONLY based on the provided context.
+IMPORTANT RULES:
+- Only use information from the context below and Make a Step by Step approach.
+- If the context doesn't contain enough information to answer, say "I don't have enough information in the documents to answer this question."
+- Always make it more technical in depth as much as you can because your readers are security researchers not normal people.
+- Always highlight the attack technique, payload, math formula properly if available.
+Context:
+{context}
+Question: {question}
+Answer:"""
+        response = self.client.text_generation(
+            prompt,
+            model=self.model_id,
+            max_new_tokens=1024,
+            temperature=0.7,
+            do_sample=True,
+        )
+        return response
+# Module-level ResearchParrot instance used by chat() and upload_and_ingest().
+app = ResearchParrot()
+def chat(message, history):
+    """Chat function for the Gradio interface"""
+    try:
+        response = app.query(message)
+        return response
+    except Exception as e:
+        return f"Error: {str(e)}. Please check that API keys are configured correctly."
+def upload_and_ingest(files):
+    """Handle file upload and ingestion"""
+    if not files:
+        return "No files uploaded."
+    try:
+        file_paths = [f.name for f in files]
+        result = app.ingest(file_paths)
+        return result
+    except Exception as e:
+        return f"Error during ingestion: {str(e)}"
+# Build Gradio Interface for Hugging Face Spaces
+with gr.Blocks(theme=gr.themes.Soft(), title="Research Parrot") as demo:
+    gr.Markdown(
+        """
+        # Research Parrot
+        ### AI-Powered Research Paper Assistant
+        Upload your research papers (PDFs) and ask questions about them.
+        Perfect for security researchers who need in-depth technical analysis.
+        """
+    )
+    with gr.Tab("💬 Chat"):
+        chatbot = gr.Chatbot(
+            label="Research Assistant",
+            height=500,
+            latex_delimiters=[
+                {"left": "$$", "right": "$$", "display": True},
+                {"left": "$", "right": "$", "display": False},
+                {"left": "\\[", "right": "\\]", "display": True},
+                {"left": "\\(", "right": "\\)", "display": False},
+            ]
+        )
+        msg = gr.Textbox(
+            label="Your Question",
+            placeholder="Ask about your research papers...",
+            lines=2
+        )
+        with gr.Row():
+            submit_btn = gr.Button("Send", variant="primary")
+            clear_btn = gr.Button("Clear")
+        gr.Examples(
+            examples=[
+                "Tell me about jailbreaking?",
+                "What is prompt injection?",
+                "Explain RAG architecture",
+                "What are the main attack vectors discussed?",
+                "Summarize the key findings"
+            ],
+            inputs=msg
+        )
+        def respond(message, chat_history):
+            bot_message = chat(message, chat_history)
+            chat_history.append((message, bot_message))
+            return "", chat_history
+        msg.submit(respond, [msg, chatbot], [msg, chatbot])
+        submit_btn.click(respond, [msg, chatbot], [msg, chatbot])
+        clear_btn.click(lambda: None, None, chatbot, queue=False)
+    with gr.Tab("📄 Upload Papers"):
+        gr.Markdown(
+            """
+            ### Upload Research Papers
+            Upload PDF files to add them to the knowledge base.
+            The papers will be processed and indexed for querying.
+            """
+        )
+        file_upload = gr.File(
+            label="Upload PDFs",
+            file_count="multiple",
+            file_types=[".pdf"],
+            type="filepath"
+        )
+        ingest_btn = gr.Button("Process & Index Papers", variant="primary")
+        ingest_output = gr.Textbox(label="Status", interactive=False)
+        ingest_btn.click(
+            fn=upload_and_ingest,
+            inputs=file_upload,
+            outputs=ingest_output
+        )
+    gr.Markdown(
+        """
+        ---
+        **Note:** Make sure to configure your `HF_TOKEN` and `PINECONE_API_KEY`
+        in the Hugging Face Space secrets.
+        """
+    )
+# Launch configuration for Hugging Face Spaces
+if __name__ == "__main__":
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False
+    )

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+gradio>=4.0.0
+huggingface_hub>=0.20.0
+langchain>=0.1.0
+langchain-community>=0.0.10
+langchain-pinecone>=0.0.1
+pinecone-client>=3.0.0
+pypdf>=3.0.0
+sentence-transformers>=2.2.0