findthehead commited on
Commit
532ca99
·
0 Parent(s):

Fresh start without PDFs

Browse files
Files changed (4) hide show
  1. .gitattributes +35 -0
  2. README.md +48 -0
  3. app.py +220 -0
  4. requirements.txt +8 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Research Parrot
3
+ emoji: 🦜
4
+ colorFrom: yellow
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 5.42.0
8
+ app_file: app.py
9
+ pinned: false
10
+ hf_oauth: true
11
+ hf_oauth_scopes:
12
+ - inference-api
13
+ ---
14
+
15
+ # 🦜 Research Parrot
16
+
17
+ An AI-powered research paper assistant for security researchers. Ask questions about security research papers and get in-depth technical analysis.
18
+
19
+ ## Features
20
+
21
+ - **RAG-based Q&A**: Query your research papers using semantic search powered by Pinecone
22
+ - **Security-focused**: Tailored responses for security researchers with technical depth
23
+ - **LaTeX Support**: Properly renders mathematical formulas and equations
24
+ - **HuggingFace Inference**: Uses open-source LLMs via HuggingFace Inference API
25
+
26
+ ## Tech Stack
27
+
28
+ - [Gradio](https://gradio.app) - Web interface
29
+ - [HuggingFace Hub](https://huggingface.co/docs/huggingface_hub) - LLM inference
30
+ - [LangChain](https://langchain.com) - RAG framework
31
+ - [Pinecone](https://pinecone.io) - Vector database
32
+
33
+ ## Configuration
34
+
35
+ Set these secrets in your Hugging Face Space settings:
36
+
37
+ | Secret | Description |
38
+ |--------|-------------|
39
+ | `HF_TOKEN` | Your Hugging Face API token |
40
+ | `PINECONE_API_KEY` | Your Pinecone API key |
41
+
42
+ ## Usage
43
+
44
+ Simply type your question about security research topics like:
45
+ - "What is prompt injection?"
46
+ - "Tell me about jailbreaking techniques"
47
+ - "Explain RAG architecture"
48
+ - "What are the main attack vectors discussed?"
app.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ import gradio as gr
4
+ from huggingface_hub import InferenceClient
5
+ from langchain_community.document_loaders import PyPDFLoader
6
+ from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
7
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
8
+ from langchain_pinecone import PineconeVectorStore
9
+ from pinecone import Pinecone
10
+
11
+ # For local development, uncomment the following:
12
+ # from dotenv import load_dotenv
13
+ # load_dotenv()
14
+
15
+ # Default model - can be changed to any HF model
16
+ DEFAULT_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
17
+
18
+
19
class ResearchParrot:
    """RAG-based assistant that answers questions about ingested research PDFs.

    Documents are chunked, embedded via the HF Inference API
    (sentence-transformers/all-MiniLM-L6-v2), stored in a Pinecone index named
    "parrot", and retrieved with similarity search to build the prompt context
    for an instruction-tuned LLM served through ``InferenceClient``.
    """

    def __init__(self, model_id: str = DEFAULT_MODEL):
        self.model_id = model_id
        # Token comes from the environment; on HF Spaces it is a Space secret
        # (see README). A missing token only fails later, at request time.
        self.client = InferenceClient(token=os.getenv("HF_TOKEN"))
        self._vectorstore = None  # lazily built in vectorstore()
        self._embeddings = None   # lazily built in embeddings()

    def embeddings(self):
        """Return the (cached) HF Inference API embeddings client.

        Fix: the original constructed a fresh embeddings client on every call
        (once per ingest and once per query via ``vectorstore()``); the client
        is stateless, so build it once and reuse it.
        """
        if self._embeddings is None:
            self._embeddings = HuggingFaceInferenceAPIEmbeddings(
                api_key=os.getenv("HF_TOKEN"),
                model_name="sentence-transformers/all-MiniLM-L6-v2"
            )
        return self._embeddings

    def load_docs_from_files(self, file_paths: list):
        """Load documents from uploaded PDF files; non-PDF paths are skipped."""
        docs = []
        for filepath in file_paths:
            # Fix: case-insensitive extension check so "PAPER.PDF" uploads
            # are not silently dropped.
            if filepath and filepath.lower().endswith('.pdf'):
                loader = PyPDFLoader(filepath)
                docs.extend(loader.load())
        return docs

    def split_docs(self, docs):
        """Split documents into overlapping ~1000-char chunks for embedding."""
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=200, add_start_index=True
        )
        return text_splitter.split_documents(docs)

    def vectorstore(self):
        """Return the lazily created, cached Pinecone-backed vector store."""
        if self._vectorstore is None:
            pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
            index = pc.Index("parrot")
            self._vectorstore = PineconeVectorStore(embedding=self.embeddings(), index=index)
        return self._vectorstore

    def ingest(self, file_paths: list):
        """Ingest uploaded PDF files into the vector store.

        Returns a human-readable status string for the UI.
        """
        docs = self.load_docs_from_files(file_paths)
        if not docs:
            return "No valid PDF documents found to ingest."

        split_docs = self.split_docs(docs)
        store = self.vectorstore()
        ids = store.add_documents(documents=split_docs)
        return f"Successfully ingested {len(ids)} document chunks from {len(file_paths)} PDF(s)."

    def query(self, question: str):
        """Answer *question* from the top-5 most similar chunks.

        Returns the generated answer string, or a guidance message when the
        question is blank or no documents match.
        """
        if not question.strip():
            return "Please enter a question."

        store = self.vectorstore()
        docs = store.similarity_search(question, k=5)

        if not docs:
            return "No relevant documents found. Please upload and ingest some PDFs first."

        context = "\n\n".join([doc.page_content for doc in docs])

        prompt = f"""You are a research assistant. Answer the question ONLY based on the provided context.

IMPORTANT RULES:
- Only use information from the context below and Make a Step by Step approach.
- If the context doesn't contain enough information to answer, say "I don't have enough information in the documents to answer this question."
- Always make it more technical in depth as much as you can because your readers are security researchers not normal people.
- Always highlight the attack technique, payload, math formula properly if available.

Context:
{context}

Question: {question}

Answer:"""

        response = self.client.text_generation(
            prompt,
            model=self.model_id,
            max_new_tokens=1024,
            temperature=0.7,
            do_sample=True,
        )
        return response
99
+
100
+
101
# Initialize the app: a single module-level ResearchParrot instance is shared
# by all Gradio callbacks below (the vector store and embeddings are cached
# on it lazily, so construction here does no network I/O).
app = ResearchParrot()
103
+
104
+
105
def chat(message, history):
    """Gradio chat callback: run *message* through the RAG pipeline.

    *history* is accepted for interface compatibility but unused —
    retrieval is stateless per question. Any failure (missing keys,
    network, Pinecone) is reported as a string instead of raising.
    """
    try:
        answer = app.query(message)
    except Exception as e:
        return f"Error: {str(e)}. Please check that API keys are configured correctly."
    return answer
112
+
113
+
114
def upload_and_ingest(files):
    """Handle file upload and ingestion; returns a status string for the UI.

    ``files`` comes from ``gr.File(type="filepath", file_count="multiple")``,
    which in current Gradio yields a list of plain ``str`` paths; older
    versions passed tempfile wrappers exposing ``.name``. Accept both.
    """
    if not files:
        return "No files uploaded."

    try:
        # Fix: with type="filepath" each entry is already a str path and has
        # no `.name` attribute — the original `f.name` raised AttributeError,
        # so every ingest attempt ended in the error branch.
        file_paths = [f if isinstance(f, str) else f.name for f in files]
        result = app.ingest(file_paths)
        return result
    except Exception as e:
        return f"Error during ingestion: {str(e)}"
125
+
126
+
127
+ # Build Gradio Interface for Hugging Face Spaces
128
# Build Gradio Interface for Hugging Face Spaces
with gr.Blocks(theme=gr.themes.Soft(), title="Research Parrot") as demo:
    gr.Markdown(
        """
        # Research Parrot
        ### AI-Powered Research Paper Assistant

        Upload your research papers (PDFs) and ask questions about them.
        Perfect for security researchers who need in-depth technical analysis.
        """
    )

    with gr.Tab("💬 Chat"):
        # Fix: use the "messages" history format — the tuple format is
        # deprecated in Gradio 5 (the Space pins sdk_version 5.42.0) and
        # emits a deprecation warning at startup.
        chatbot = gr.Chatbot(
            label="Research Assistant",
            height=500,
            type="messages",
            # LaTeX delimiters so formulas in answers render ($..$, $$..$$, \( \), \[ \]).
            latex_delimiters=[
                {"left": "$$", "right": "$$", "display": True},
                {"left": "$", "right": "$", "display": False},
                {"left": "\\[", "right": "\\]", "display": True},
                {"left": "\\(", "right": "\\)", "display": False},
            ]
        )

        msg = gr.Textbox(
            label="Your Question",
            placeholder="Ask about your research papers...",
            lines=2
        )

        with gr.Row():
            submit_btn = gr.Button("Send", variant="primary")
            clear_btn = gr.Button("Clear")

        gr.Examples(
            examples=[
                "Tell me about jailbreaking?",
                "What is prompt injection?",
                "Explain RAG architecture",
                "What are the main attack vectors discussed?",
                "Summarize the key findings"
            ],
            inputs=msg
        )

        def respond(message, chat_history):
            """Append the user turn and the model's answer in messages format."""
            bot_message = chat(message, chat_history)
            chat_history.append({"role": "user", "content": message})
            chat_history.append({"role": "assistant", "content": bot_message})
            return "", chat_history

        msg.submit(respond, [msg, chatbot], [msg, chatbot])
        submit_btn.click(respond, [msg, chatbot], [msg, chatbot])
        clear_btn.click(lambda: None, None, chatbot, queue=False)

    with gr.Tab("📄 Upload Papers"):
        gr.Markdown(
            """
            ### Upload Research Papers
            Upload PDF files to add them to the knowledge base.
            The papers will be processed and indexed for querying.
            """
        )

        file_upload = gr.File(
            label="Upload PDFs",
            file_count="multiple",
            file_types=[".pdf"],
            type="filepath"
        )

        ingest_btn = gr.Button("Process & Index Papers", variant="primary")
        ingest_output = gr.Textbox(label="Status", interactive=False)

        ingest_btn.click(
            fn=upload_and_ingest,
            inputs=file_upload,
            outputs=ingest_output
        )

    gr.Markdown(
        """
        ---
        **Note:** Make sure to configure your `HF_TOKEN` and `PINECONE_API_KEY`
        in the Hugging Face Space secrets.
        """
    )
213
+
214
+ # Launch configuration for Hugging Face Spaces
215
+ if __name__ == "__main__":
216
+ demo.launch(
217
+ server_name="0.0.0.0",
218
+ server_port=7860,
219
+ share=False
220
+ )
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio>=5.0.0
2
+ huggingface_hub>=0.20.0
3
+ langchain>=0.1.0
4
+ langchain-community>=0.0.10
5
+ langchain-pinecone>=0.0.1
6
+ pinecone-client>=3.0.0
7
+ pypdf>=3.0.0
8
+ sentence-transformers>=2.2.0