File size: 8,763 Bytes
3da159d
73de916
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9978c4b
 
 
 
538c8b3
 
73de916
 
a6d8a05
 
 
 
73de916
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78efe2b
73de916
 
 
 
 
78efe2b
73de916
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3da159d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
"""
import os
import requests
import torch
import gradio as gr
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import ServiceContext, SimpleDirectoryReader, VectorStoreIndex
from llama_index.llms import HuggingFaceLLM
from llama_index.prompts.prompts import SimpleInputPrompt
from llama_index.embeddings import LangchainEmbedding

def download_pdf_from_url(url, save_path="/content/Data/input.pdf"):
    response = requests.get(url)
    if response.status_code == 200:
        with open(save_path, 'wb') as file:
            file.write(response.content)
        print(f"PDF downloaded and saved to {save_path}")
    else:
        print(f"Failed to download PDF. Status code: {response.status_code}")

def mod(pdf_url):
    if not os.path.exists("/Data/"):                         # /content/Data --> /Data/
        os.makedirs("/Data/")                                # /content/Data --> /Data/
    download_pdf_from_url(pdf_url)                           # /content/Data --> /Data/
    documents = SimpleDirectoryReader("/Data/").load_data()
    system_prompt = '''You are an expert share market document summarizer specializing in creating concise, comprehensive summaries tailored for professional audiences. Your goal is to summarize pdf which may also include tabular columns, as
accurately as possible based on the instructions and context provided.'''
    query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")
    from huggingface_hub import login
    hf_token = os.environ.get('HF_TOKEN')
    if not hf_token:
        raise ValueError("HF_TOKEN environment variable not found. Please set it in your Space settings.")
    login(token=hf_token)
    llm = HuggingFaceLLM(
        context_window=4096,
        max_new_tokens=750,
        generate_kwargs={"temperature": 0.5, "do_sample": False},
        system_prompt=system_prompt,
        query_wrapper_prompt=query_wrapper_prompt,
        tokenizer_name="mistralai/Mistral-7B-Instruct-v0.1",
        model_name="mistralai/Mistral-7B-Instruct-v0.1",
        device_map="auto",
        model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True}
    )
    embed_model = LangchainEmbedding(
        HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    )
    service_context = ServiceContext.from_defaults(
        chunk_size=1024,
        llm=llm,
        embed_model=embed_model
    )
    index = VectorStoreIndex.from_documents(documents, service_context=service_context)
    query_engine = index.as_query_engine()
    response = query_engine.query('''You are an expert share market document summarizer specializing in creating concise, comprehensive summaries tailored for professional audiences. Your task is to analyze the given document and generate a structured summary in approximately 500 words. Ensure the summary:
Captures all key points, including data, insights, and observations.
Clearly outlines the context, such as the purpose of the document and relevant background information.
Summarizes tabular data and numerical figures effectively, while retaining accuracy and relevance.
Highlights significant trends, comparisons, or impacts mentioned in the document.
Uses formal and precise language suitable for a corporate or academic audience.
The output should be well-organized with clear headings or bullet points where applicable. Avoid omitting any critical information, and focus on maintaining a balance between brevity and detail.''')
    return str(response.response)

def func(url):
    return mod(url)

iface = gr.Interface(
        fn=func,
        inputs="text",
        outputs=gr.Textbox(
          label="Output Summary",
          placeholder="The summary will appear here . . .",
          lines=10, 
          interactive=False),
        examples=[['https://cdn-sn.samco.in/ec90fa5b637541d3c86fdb86f45d920c.pdf'],
                  ['https://cdn-sn.samco.in/7c8616b72b4aa639c0eda9f44285ab1d.pdf'],
                  ['https://cdn-sn.samco.in/a4b95bc0bdb8361459a8b41bfc0ff317.pdf']],
        flagging_options=["Useful", "Mediocre 50-50", "Not Useful"],
        description="Flag it for every response and classify it according to what you feel!"
    )

iface.launch(share=True, debug=True)
"""






import os
import requests
import torch
import gradio as gr
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.llms import HuggingFaceLLM
from llama_index.prompts.prompts import SimpleInputPrompt
from llama_index.embeddings import LangchainEmbedding
import fitz  # PyMuPDF

# Function to process the PDF directly from URL
def process_pdf_from_url(pdf_url, timeout=30):
    """Download a PDF over HTTP and return the text of all its pages.

    Args:
        pdf_url: URL of the PDF to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block forever on a stalled host.

    Returns:
        The concatenated text of every page, or an empty string when the
        download fails (network error, invalid URL, or non-200 status).
    """
    try:
        response = requests.get(pdf_url, timeout=timeout)
    except requests.RequestException as exc:
        # Covers connection errors, timeouts and malformed/empty URLs so the
        # caller sees the same "empty text" failure signal as for a bad status.
        print(f"Failed to retrieve PDF. Error: {exc}")
        return ""

    if response.status_code != 200:
        print(f"Failed to retrieve PDF. Status code: {response.status_code}")
        return ""

    # Open the PDF straight from memory; the context manager closes the
    # document once the text has been extracted.
    with fitz.open(stream=response.content, filetype="pdf") as doc:
        return "".join(page.get_text("text") for page in doc)

def mod(pdf_url):
    """Summarize the PDF at ``pdf_url`` with a retrieval-augmented LLM query.

    The PDF text is fetched with ``process_pdf_from_url``, indexed in a
    ``VectorStoreIndex`` and queried with a fixed summarization prompt.

    Args:
        pdf_url: URL of the PDF document to summarize.

    Returns:
        The generated summary as a string, or the message
        ``"Failed to process the PDF."`` when no text could be extracted.

    Raises:
        ValueError: If the ``HF_TOKEN`` environment variable is not set.
    """
    # Imported locally, next to its only use (the file already imports
    # huggingface_hub the same way).
    from llama_index import Document

    # Process the PDF directly from URL
    document_text = process_pdf_from_url(pdf_url)
    if not document_text:
        return "Failed to process the PDF."

    # VectorStoreIndex.from_documents expects Document objects, not raw
    # strings, so wrap the extracted text before indexing.
    documents = [Document(text=document_text)]

    system_prompt = """You are an expert share market document summarizer specializing in creating concise, comprehensive summaries tailored for professional audiences. Your goal is to summarize pdf which may also include tabular columns, as accurately as possible based on the instructions and context provided."""
    query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

    # Hugging Face Token
    from huggingface_hub import login
    hf_token = os.environ.get('HF_TOKEN')
    if not hf_token:
        raise ValueError("HF_TOKEN environment variable not found. Please set it in your Space settings.")
    login(token=hf_token)

    # Define the LLM and embeddings models
    # NOTE(review): "temperature" is presumably ignored while "do_sample" is
    # False (greedy decoding) -- confirm which of the two is intended.
    llm = HuggingFaceLLM(
        context_window=4096,
        max_new_tokens=750,
        generate_kwargs={"temperature": 0.5, "do_sample": False},
        system_prompt=system_prompt,
        query_wrapper_prompt=query_wrapper_prompt,
        tokenizer_name="mistralai/Mistral-7B-Instruct-v0.1",
        model_name="mistralai/Mistral-7B-Instruct-v0.1",
        device_map="auto",
        model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True}
    )

    embed_model = LangchainEmbedding(
        HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    )

    # Create service context and index
    service_context = ServiceContext.from_defaults(
        chunk_size=1024,
        llm=llm,
        embed_model=embed_model
    )

    # Indexing the document
    index = VectorStoreIndex.from_documents(documents, service_context=service_context)
    query_engine = index.as_query_engine()

    # Query to generate summary
    response = query_engine.query("""You are an expert share market document summarizer specializing in creating concise, comprehensive summaries tailored for professional audiences. Your task is to analyze the given document and generate a structured summary in approximately 500 words. Ensure the summary:
    - Captures all key points, including data, insights, and observations.
    - Clearly outlines the context, such as the purpose of the document and relevant background information.
    - Summarizes tabular data and numerical figures effectively, while retaining accuracy and relevance.
    - Highlights significant trends, comparisons, or impacts mentioned in the document.
    - Uses formal and precise language suitable for a corporate or academic audience.
    The output should be well-organized with clear headings or bullet points where applicable. Avoid omitting any critical information, and focus on maintaining a balance between brevity and detail.""")

    return str(response.response)

# Gradio Interface
def func(url):
    """Gradio callback: return the summary produced by ``mod`` for ``url``."""
    summary = mod(url)
    return summary

# Read-only text area that displays the generated summary.
summary_box = gr.Textbox(
    label="Output Summary",
    placeholder="The summary will appear here . . .",
    lines=10,
    interactive=False,
)

# Sample PDF URLs offered as one-click examples in the UI.
example_urls = [
    ['https://cdn-sn.samco.in/ec90fa5b637541d3c86fdb86f45d920c.pdf'],
    ['https://cdn-sn.samco.in/7c8616b72b4aa639c0eda9f44285ab1d.pdf'],
    ['https://cdn-sn.samco.in/a4b95bc0bdb8361459a8b41bfc0ff317.pdf'],
]

# Labels a user can pick from when flagging a response.
feedback_labels = ["Useful", "Mediocre 50-50", "Not Useful"]

# Single text input (the PDF URL) mapped to the summary text box.
iface = gr.Interface(
    fn=func,
    inputs="text",
    outputs=summary_box,
    examples=example_urls,
    flagging_options=feedback_labels,
    description="Flag it for every response and classify it according to what you feel!",
)

# Start the app with a public share link and debug output enabled.
iface.launch(share=True, debug=True)