PraneshJs committed on
Commit
0cf7f07
·
verified ·
1 Parent(s): b9af798

fixed dict issue

Browse files
Files changed (1) hide show
  1. app.py +74 -72
app.py CHANGED
@@ -10,13 +10,14 @@ from langchain_community.vectorstores import FAISS
10
  from langchain_huggingface import HuggingFaceEmbeddings
11
  from threading import Thread
12
  from dotenv import load_dotenv
 
13
 
14
  load_dotenv()
15
 
16
  # === CONFIG ===
17
  STORAGE_DIR = "storage"
18
- CLEANUP_INTERVAL = 600 # 10 min
19
- SESSION_TTL = 1000 # 30 min
20
  OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
21
  OPENROUTER_MODEL = "z-ai/glm-4.5-air:free"
22
 
@@ -39,106 +40,111 @@ Thread(target=cleanup_old_sessions, daemon=True).start()
39
  def process_pdf(pdf_file):
40
  if pdf_file is None:
41
  return "No file uploaded.", "", []
 
42
  session_id = str(uuid.uuid4())
43
  reader = PdfReader(pdf_file.name)
44
 
45
- # Extract text
46
- text = "".join([page.extract_text() for page in reader.pages if page.extract_text()])
 
 
47
 
48
- # Metadata
49
- page_count = len(reader.pages)
50
- first_page_text = reader.pages[0].extract_text() if page_count > 0 else ""
51
- guessed_title = first_page_text.split("\n")[0] if first_page_text else "Unknown Title"
52
 
53
- # Split text
54
  splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
55
  chunks = splitter.split_text(text)
56
 
57
- # Embeddings + FAISS
58
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
59
  session_path = os.path.join(STORAGE_DIR, session_id)
60
  os.makedirs(session_path, exist_ok=True)
 
61
  db = FAISS.from_texts(chunks, embeddings)
62
  db.save_local(session_path)
63
 
64
- # Save metadata
65
- metadata_path = os.path.join(session_path, "metadata.txt")
66
- with open(metadata_path, "w", encoding="utf-8") as f:
67
- f.write(f"title={guessed_title}\n")
68
- f.write(f"pages={page_count}\n")
 
 
 
 
69
 
70
- chat_history = [("System", f"Paper uploaded. Title: {guessed_title}, Pages: {page_count}. You can now ask questions.")]
 
 
71
  return f"Paper uploaded successfully. Session ID: {session_id}", session_id, chat_history
72
 
73
  # === QUERY FUNCTION ===
74
  def query_paper(session_id, user_message, chat_history):
75
  if not session_id or not os.path.exists(os.path.join(STORAGE_DIR, session_id)):
76
  chat_history = chat_history or []
77
- chat_history.append(("System", "Session expired or not found. Upload the paper again."))
78
  return chat_history, ""
79
 
80
  if not user_message.strip():
81
  return chat_history, ""
82
 
83
  session_path = os.path.join(STORAGE_DIR, session_id)
84
-
85
- # Load metadata
86
- metadata_path = os.path.join(session_path, "metadata.txt")
87
- metadata = {}
88
- if os.path.exists(metadata_path):
89
- with open(metadata_path, "r", encoding="utf-8") as f:
90
- for line in f:
91
- k, v = line.strip().split("=", 1)
92
- metadata[k] = v
93
-
94
- # Load retriever
95
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
96
  db = FAISS.load_local(session_path, embeddings, allow_dangerous_deserialization=True)
97
  retriever = db.as_retriever(search_kwargs={"k": 3})
98
 
99
- # Retrieve context
100
- docs = retriever.invoke(user_message)
101
- context = "\n\n".join([d.page_content for d in docs])
102
-
103
- # Prompt
104
- prompt = f"""
105
- You are an AI assistant that explains research papers in clear, structured, simple terms.
106
- You can use BOTH metadata and the paper content.
107
-
108
- Metadata:
109
- - Title: {metadata.get('title','Unknown')}
110
- - Pages: {metadata.get('pages','Unknown')}
111
-
112
- Paper content (retrieved chunks):
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  {context}
114
 
115
- User Question: {user_message}
116
- Answer in plain English with clarity.
117
  """
118
-
119
- headers = {"Authorization": f"Bearer {OPENROUTER_API_KEY}", "Content-Type": "application/json"}
120
- payload = {
121
- "model": OPENROUTER_MODEL,
122
- "messages": [
123
- {"role": "system", "content": "You are a helpful research paper explainer. Use metadata if the user asks about title, authors, or page count. Otherwise, use the retrieved context."},
124
- {"role": "user", "content": prompt}
125
- ]
126
- }
127
-
128
- try:
129
- response = requests.post("https://openrouter.ai/api/v1/chat/completions",
130
- headers=headers, json=payload)
131
-
132
- if response.status_code == 200:
133
- answer = response.json()["choices"][0]["message"]["content"].strip()
134
- else:
135
- answer = f"Error: {response.status_code} - {response.text}"
136
- except Exception as e:
137
- answer = f"Error: {str(e)}"
138
 
139
  chat_history = chat_history or []
140
- chat_history.append((user_message, answer))
141
-
142
  return chat_history, ""
143
 
144
  # === GRADIO UI ===
@@ -149,7 +155,7 @@ with gr.Blocks() as demo:
149
  pdf_input = gr.File(label="Upload Research Paper (PDF)", file_types=[".pdf"])
150
  session_box = gr.Textbox(label="Session ID", interactive=False)
151
 
152
- chatbot = gr.Chatbot(label="Chat about your paper", height=400, type="messages")
153
  user_message = gr.Textbox(label="Ask a question", placeholder="What is this paper about?")
154
 
155
  with gr.Row():
@@ -157,11 +163,9 @@ with gr.Blocks() as demo:
157
  ask_btn = gr.Button("Send Question")
158
  clear_btn = gr.Button("Clear Chat")
159
 
160
- # Store chat history and session
161
  state_chat = gr.State([])
162
  state_session = gr.State("")
163
 
164
- # Upload
165
  def handle_upload(pdf_file):
166
  status, session_id, chat_history = process_pdf(pdf_file)
167
  return status, session_id, chat_history
@@ -172,7 +176,6 @@ with gr.Blocks() as demo:
172
  outputs=[session_box, state_session, state_chat]
173
  )
174
 
175
- # Ask
176
  def handle_question(session_id, message, chat_history):
177
  updated_chat, _ = query_paper(session_id, message, chat_history)
178
  return updated_chat, ""
@@ -197,7 +200,6 @@ with gr.Blocks() as demo:
197
  outputs=[state_chat]
198
  )
199
 
200
- # Clear
201
  def clear_chat():
202
  return [], []
203
 
@@ -212,4 +214,4 @@ with gr.Blocks() as demo:
212
  outputs=[chatbot]
213
  )
214
 
215
- demo.launch(debug=True)
 
10
  from langchain_huggingface import HuggingFaceEmbeddings
11
  from threading import Thread
12
  from dotenv import load_dotenv
13
+ import json
14
 
15
  load_dotenv()
16
 
17
  # === CONFIG ===
18
  STORAGE_DIR = "storage"
19
+ CLEANUP_INTERVAL = 600
20
+ SESSION_TTL = 1000
21
  OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
22
  OPENROUTER_MODEL = "z-ai/glm-4.5-air:free"
23
 
 
40
  def process_pdf(pdf_file):
41
  if pdf_file is None:
42
  return "No file uploaded.", "", []
43
+
44
  session_id = str(uuid.uuid4())
45
  reader = PdfReader(pdf_file.name)
46
 
47
+ metadata = reader.metadata or {}
48
+ num_pages = len(reader.pages)
49
+ title = metadata.get("/Title", "Unknown Title")
50
+ author = metadata.get("/Author", "Unknown Author")
51
 
52
+ text = "".join([page.extract_text() for page in reader.pages if page.extract_text()])
 
 
 
53
 
 
54
  splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
55
  chunks = splitter.split_text(text)
56
 
 
57
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
58
  session_path = os.path.join(STORAGE_DIR, session_id)
59
  os.makedirs(session_path, exist_ok=True)
60
+
61
  db = FAISS.from_texts(chunks, embeddings)
62
  db.save_local(session_path)
63
 
64
+ meta_data = {
65
+ "title": title,
66
+ "author": author,
67
+ "pages": num_pages,
68
+ "session_id": session_id,
69
+ "created_at": time.ctime()
70
+ }
71
+ with open(os.path.join(session_path, "metadata.json"), "w") as f:
72
+ json.dump(meta_data, f)
73
 
74
+ chat_history = [
75
+ {"role": "system", "content": f"📄 Paper uploaded.\nTitle: {title}\nAuthor: {author}\nPages: {num_pages}"}
76
+ ]
77
  return f"Paper uploaded successfully. Session ID: {session_id}", session_id, chat_history
78
 
79
  # === QUERY FUNCTION ===
80
  def query_paper(session_id, user_message, chat_history):
81
  if not session_id or not os.path.exists(os.path.join(STORAGE_DIR, session_id)):
82
  chat_history = chat_history or []
83
+ chat_history.append({"role": "system", "content": "Session expired or not found. Upload the paper again."})
84
  return chat_history, ""
85
 
86
  if not user_message.strip():
87
  return chat_history, ""
88
 
89
  session_path = os.path.join(STORAGE_DIR, session_id)
 
 
 
 
 
 
 
 
 
 
 
90
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
91
  db = FAISS.load_local(session_path, embeddings, allow_dangerous_deserialization=True)
92
  retriever = db.as_retriever(search_kwargs={"k": 3})
93
 
94
+ metadata_path = os.path.join(session_path, "metadata.json")
95
+ if os.path.exists(metadata_path):
96
+ with open(metadata_path, "r") as f:
97
+ metadata = json.load(f)
98
+ else:
99
+ metadata = {"title": "Unknown", "author": "Unknown", "pages": "Unknown"}
100
+
101
+ lower_q = user_message.lower()
102
+ if "title" in lower_q or "name of this paper" in lower_q:
103
+ answer = f"The title of this paper is: **{metadata['title']}**."
104
+ elif "author" in lower_q or "who wrote" in lower_q:
105
+ answer = f"The author of this paper is: **{metadata['author']}**."
106
+ elif "pages" in lower_q or "how many pages" in lower_q:
107
+ answer = f"This paper has **{metadata['pages']} pages**."
108
+ else:
109
+ docs = retriever.invoke(user_message)
110
+ context = "\n\n".join([d.page_content for d in docs])
111
+
112
+ prompt = f"""
113
+ You are an AI research assistant. Use the paper content and metadata to answer clearly.
114
+
115
+ Paper Metadata:
116
+ - Title: {metadata['title']}
117
+ - Author: {metadata['author']}
118
+ - Pages: {metadata['pages']}
119
+
120
+ Context from paper:
121
  {context}
122
 
123
+ Question: {user_message}
124
+ Answer:
125
  """
126
+ headers = {"Authorization": f"Bearer {OPENROUTER_API_KEY}", "Content-Type": "application/json"}
127
+ payload = {
128
+ "model": OPENROUTER_MODEL,
129
+ "messages": [
130
+ {"role": "system", "content": "You are a helpful research explainer. Always use metadata if available."},
131
+ {"role": "user", "content": prompt}
132
+ ]
133
+ }
134
+
135
+ try:
136
+ response = requests.post("https://openrouter.ai/api/v1/chat/completions",
137
+ headers=headers, json=payload)
138
+ if response.status_code == 200:
139
+ answer = response.json()["choices"][0]["message"]["content"].strip()
140
+ else:
141
+ answer = f"Error: {response.status_code} - {response.text}"
142
+ except Exception as e:
143
+ answer = f"Error: {str(e)}"
 
 
144
 
145
  chat_history = chat_history or []
146
+ chat_history.append({"role": "user", "content": user_message})
147
+ chat_history.append({"role": "assistant", "content": answer})
148
  return chat_history, ""
149
 
150
  # === GRADIO UI ===
 
155
  pdf_input = gr.File(label="Upload Research Paper (PDF)", file_types=[".pdf"])
156
  session_box = gr.Textbox(label="Session ID", interactive=False)
157
 
158
+ chatbot = gr.Chatbot(label="Chat about your paper", type="messages", height=400)
159
  user_message = gr.Textbox(label="Ask a question", placeholder="What is this paper about?")
160
 
161
  with gr.Row():
 
163
  ask_btn = gr.Button("Send Question")
164
  clear_btn = gr.Button("Clear Chat")
165
 
 
166
  state_chat = gr.State([])
167
  state_session = gr.State("")
168
 
 
169
  def handle_upload(pdf_file):
170
  status, session_id, chat_history = process_pdf(pdf_file)
171
  return status, session_id, chat_history
 
176
  outputs=[session_box, state_session, state_chat]
177
  )
178
 
 
179
  def handle_question(session_id, message, chat_history):
180
  updated_chat, _ = query_paper(session_id, message, chat_history)
181
  return updated_chat, ""
 
200
  outputs=[state_chat]
201
  )
202
 
 
203
  def clear_chat():
204
  return [], []
205
 
 
214
  outputs=[chatbot]
215
  )
216
 
217
+ demo.launch(debug=True, server_name="0.0.0.0", server_port=7860)