cipherunhsiv committed on
Commit
3da159d
·
verified ·
1 Parent(s): 9978c4b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +114 -1
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import os
2
  import requests
3
  import torch
@@ -78,4 +79,116 @@ iface = gr.Interface(
78
  description="Flag it for every response and classify it according to what you feel!"
79
  )
80
 
81
- iface.launch(share=True, debug=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
  import os
3
  import requests
4
  import torch
 
79
  description="Flag it for every response and classify it according to what you feel!"
80
  )
81
 
82
+ iface.launch(share=True, debug=True)
83
+ """
84
+
85
+
86
+
87
+
88
+
89
+
90
+ import os
91
+ import requests
92
+ import torch
93
+ import gradio as gr
94
+ from langchain.embeddings.huggingface import HuggingFaceEmbeddings
95
+ from llama_index import ServiceContext, VectorStoreIndex
96
+ from llama_index.llms import HuggingFaceLLM
97
+ from llama_index.prompts.prompts import SimpleInputPrompt
98
+ from llama_index.embeddings import LangchainEmbedding
99
+ import fitz # PyMuPDF
100
+
101
+ # Function to process the PDF directly from URL
102
def process_pdf_from_url(pdf_url, timeout=30):
    """Download a PDF and return the plain text of all of its pages.

    Args:
        pdf_url: URL of the PDF to fetch.
        timeout: Seconds to wait on the server before giving up, so that a
            stalled host cannot block the caller forever.

    Returns:
        The text of every page concatenated in page order, or an empty
        string when the download fails (request error or non-200 status).
    """
    try:
        response = requests.get(pdf_url, timeout=timeout)
    except requests.RequestException as exc:
        # Report transport-level failures the same way as a bad status code:
        # the caller treats an empty string as "could not process the PDF".
        print(f"Failed to retrieve PDF. Error: {exc}")
        return ""

    if response.status_code != 200:
        print(f"Failed to retrieve PDF. Status code: {response.status_code}")
        return ""

    # Parse the bytes in memory; the context manager closes the document
    # as soon as the text has been extracted.
    with fitz.open(stream=response.content, filetype="pdf") as doc:
        return "".join(page.get_text("text") for page in doc)
114
+
115
# Lazily-built ServiceContext shared by every call to mod(); loading the LLM
# and the embedding model is far too expensive to repeat per request.
_SERVICE_CONTEXT = None


def _get_service_context():
    """Log in to Hugging Face and build the LLM/embedding context once.

    Returns:
        The cached ``ServiceContext``; it is created on the first call.

    Raises:
        ValueError: If the ``HF_TOKEN`` environment variable is not set.
    """
    global _SERVICE_CONTEXT
    if _SERVICE_CONTEXT is not None:
        return _SERVICE_CONTEXT

    system_prompt = """You are an expert share market document summarizer specializing in creating concise, comprehensive summaries tailored for professional audiences. Your goal is to summarize pdf which may also include tabular columns, as accurately as possible based on the instructions and context provided."""
    query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

    # Hugging Face Token
    from huggingface_hub import login
    hf_token = os.environ.get('HF_TOKEN')
    if not hf_token:
        raise ValueError("HF_TOKEN environment variable not found. Please set it in your Space settings.")
    login(token=hf_token)

    # Define the LLM and embeddings models
    # NOTE(review): with do_sample=False the temperature value is presumably
    # ignored by generation -- confirm and drop one of the two settings.
    llm = HuggingFaceLLM(
        context_window=4096,
        max_new_tokens=750,
        generate_kwargs={"temperature": 0.5, "do_sample": False},
        system_prompt=system_prompt,
        query_wrapper_prompt=query_wrapper_prompt,
        tokenizer_name="mistralai/Mistral-7B-Instruct-v0.1",
        model_name="mistralai/Mistral-7B-Instruct-v0.1",
        device_map="auto",
        model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True}
    )

    embed_model = LangchainEmbedding(
        HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    )

    _SERVICE_CONTEXT = ServiceContext.from_defaults(
        chunk_size=1024,
        llm=llm,
        embed_model=embed_model
    )
    return _SERVICE_CONTEXT


def mod(pdf_url):
    """Summarize the PDF found at ``pdf_url``.

    The PDF text is fetched with ``process_pdf_from_url``, indexed in a
    ``VectorStoreIndex`` and queried with a fixed summarization prompt.

    Args:
        pdf_url: URL of the PDF document to summarize.

    Returns:
        The generated summary as a string, or ``"Failed to process the PDF."``
        when no text could be extracted from the URL.

    Raises:
        ValueError: If the ``HF_TOKEN`` environment variable is not set.
    """
    # Imported here so the fix does not depend on the file's import header.
    from llama_index import Document

    # Process the PDF directly from URL
    document_text = process_pdf_from_url(pdf_url)
    if not document_text:
        return "Failed to process the PDF."

    # from_documents() works on Document objects, not on raw strings.
    documents = [Document(text=document_text)]

    # Indexing the document
    index = VectorStoreIndex.from_documents(documents, service_context=_get_service_context())
    query_engine = index.as_query_engine()

    # Query to generate summary
    response = query_engine.query("""You are an expert share market document summarizer specializing in creating concise, comprehensive summaries tailored for professional audiences. Your task is to analyze the given document and generate a structured summary in approximately 500 words. Ensure the summary:
    - Captures all key points, including data, insights, and observations.
    - Clearly outlines the context, such as the purpose of the document and relevant background information.
    - Summarizes tabular data and numerical figures effectively, while retaining accuracy and relevance.
    - Highlights significant trends, comparisons, or impacts mentioned in the document.
    - Uses formal and precise language suitable for a corporate or academic audience.
    The output should be well-organized with clear headings or bullet points where applicable. Avoid omitting any critical information, and focus on maintaining a balance between brevity and detail.""")

    return str(response.response)
171
+
172
+ # Gradio Interface
173
def func(url):
    """Gradio callback: return the summary produced by ``mod`` for ``url``."""
    summary = mod(url)
    return summary
175
+
176
# Read-only textbox in which the generated summary is shown.
_summary_box = gr.Textbox(
    label="Output Summary",
    placeholder="The summary will appear here . . .",
    lines=10,
    interactive=False,
)

# Sample PDF URLs offered as ready-made inputs, one per example row.
_example_rows = [
    ['https://cdn-sn.samco.in/ec90fa5b637541d3c86fdb86f45d920c.pdf'],
    ['https://cdn-sn.samco.in/7c8616b72b4aa639c0eda9f44285ab1d.pdf'],
    ['https://cdn-sn.samco.in/a4b95bc0bdb8361459a8b41bfc0ff317.pdf'],
]

# Labels a user can pick from when flagging a response.
_flag_choices = ["Useful", "Mediocre 50-50", "Not Useful"]

# Text-in / text-out interface around the summarization callback.
iface = gr.Interface(
    fn=func,
    inputs="text",
    outputs=_summary_box,
    examples=_example_rows,
    flagging_options=_flag_choices,
    description="Flag it for every response and classify it according to what you feel!",
)

iface.launch(share=True, debug=True)