```python
from dotenv import load_dotenv
from utils import read_files, split_document_by_tokens
from pathlib import Path
import os

load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

documents = read_files(Path('../../content/books'))
```
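`read_files` is a small helper from the local `utils` module and is not shown in this post. A minimal sketch of what it might look like, assuming it loads every text file under a directory into LangChain `Document` objects (the `.txt` glob is an assumption):

```python
# Hypothetical sketch of utils.read_files -- the real helper is not shown here.
from pathlib import Path
from langchain.docstore.document import Document

def read_files(directory: Path) -> list[Document]:
    # Load each text file into a Document, keeping the path as metadata
    documents = []
    for path in directory.glob("**/*.txt"):
        documents.append(Document(
            page_content=path.read_text(encoding="utf-8"),
            metadata={"source": str(path)},
        ))
    return documents
```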
Map Reduce is a good way to work around token limitations: if the text you are summarizing is too long for your chosen model's context window, you can summarize it in pieces instead.

Map Reduce works like this:

- The text is broken up into manageable pieces (chunks)
- A summary is generated for each chunk
- A final summary is generated from all the chunk summaries
This method is well suited for summarizing some types of text, but it has limitations:

- The model may over- or underemphasize certain aspects of the text
- It gets expensive quickly, since it requires many calls to the LLM and a large number of input tokens (the sketch below gives a rough way to estimate this)
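To get a feel for the cost before running anything, you can count input tokens with `tiktoken`. This is only a back-of-the-envelope sketch, and the default price per 1K input tokens is an assumption; substitute the current rate for your model:

```python
import tiktoken

def estimate_input_cost(text: str, price_per_1k_tokens: float = 0.01) -> float:
    # gpt-4-0125-preview uses the cl100k_base encoding.
    # The default price is an assumption, not a quoted rate.
    encoding = tiktoken.get_encoding("cl100k_base")
    num_tokens = len(encoding.encode(text))
    return num_tokens / 1000 * price_per_1k_tokens
```

Since the map step reads every input token once and the reduce step reads all the chunk summaries again, the total cost grows roughly linearly with document length.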
= """Write a short summary of the following text:
summary_map_template
{context}
SUMMARY:
"""
= """The following text is a set of summaries:
summary_reduce_template
{doc_summaries}
Create a cohesive summary from the above text.
SUMMARY:"""
```python
from langchain.chains import LLMChain, ReduceDocumentsChain, MapReduceDocumentsChain, StuffDocumentsChain
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

def summarize_document(document: list[Document]):
    llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-4-0125-preview")

    # Chain to generate a summary from each chunk
    map_prompt = PromptTemplate.from_template(summary_map_template)
    map_chain = LLMChain(prompt=map_prompt, llm=llm)

    # Chain to generate one cohesive summary from the chunk summaries
    reduce_prompt = PromptTemplate.from_template(summary_reduce_template)
    reduce_chain = LLMChain(prompt=reduce_prompt, llm=llm)
    stuff_chain = StuffDocumentsChain(llm_chain=reduce_chain, document_variable_name="doc_summaries")
    reduce_docs_chain = ReduceDocumentsChain(combine_documents_chain=stuff_chain)

    # The complete map reduce chain; document_variable_name must match the
    # {context} placeholder in the map prompt
    map_reduce_chain = MapReduceDocumentsChain(
        llm_chain=map_chain,
        document_variable_name="context",
        reduce_documents_chain=reduce_docs_chain,
    )

    # Split the document into token-bounded chunks, then run the full chain
    splitdocs = split_document_by_tokens(document, 15000, 200)
    summary = map_reduce_chain.run(splitdocs)
    return summary
```
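Like `read_files`, `split_document_by_tokens` lives in the local `utils` module. Assuming it wraps LangChain's `TokenTextSplitter`, with the second and third arguments being chunk size and chunk overlap in tokens, it might look roughly like this:

```python
# Hypothetical sketch of utils.split_document_by_tokens (the real helper is not shown).
from langchain.docstore.document import Document
from langchain.text_splitter import TokenTextSplitter

def split_document_by_tokens(document: list[Document], chunk_size: int, chunk_overlap: int) -> list[Document]:
    # Split on token boundaries so each chunk fits comfortably in the context window
    splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(document)
```

With everything in place, summarizing the loaded documents is a single call (assuming `read_files` returns a `list[Document]`):

```python
summary = summarize_document(documents)
print(summary)
```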