From 6b3cb37fad9898d2c8d24db0a791b0585f63d68f Mon Sep 17 00:00:00 2001 From: Jake Runyan Date: Tue, 11 Feb 2025 00:46:01 -0800 Subject: [PATCH] Hook up caption_image tool --- README.md | 41 ++++++++++++++++++++++++++++++++++++++++- main.py | 29 +++++++++++++++++------------ tools/caption_image.py | 18 +++++++++++++----- 3 files changed, 70 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 3acd9db..d490372 100644 --- a/README.md +++ b/README.md @@ -32,9 +32,48 @@ Pull model ``` ollama pull llama3 ``` -If failing, check status of ollama process: +If this fails, check the status of the ollama process (Ubuntu): ``` sudo service ollama status ``` +Or start the Ollama app from search (macOS). +## Results +### So This Actually Works! +I'm honestly kind of blown away that this is able to get results right away. Use of the get_memes tool is reliable, but the agent sometimes calls it more than once, despite being prompted to use it only once. +The remaining problems lie in the caption_image tool: the agent sometimes calls it correctly and sometimes does not, which kicks the chat back to the user and breaks the flow. +However, when the agent is able to call caption_image correctly, we get the results we're after: + +``` +env➜ memechain git:(main) ✗ python main.py + +> Entering new AgentExecutor chain... +Let's get started. + +Thought: To generate an image for the "two buttons" meme, I need to find a template_id that corresponds to this meme. Then, I can use the Caption Image tool to create a new meme with the desired text. + +Action: Get Memes +Action Input: None (no input needed) +Observation: ID: 181913649, Name: Drake Hotline Bling +ID: 87743020, Name: Two Buttons + ... +ID: 398221598, Name: Goose Chase +Thought:Thought: Now that I have the list of template_ids and names from Get Memes, I can find the ID for the "two buttons" meme. The name "Two Buttons" matches with the template_id 87743020. 
+ +Action: Caption Image +Action Input: {'template_id': 87743020, 'text': ['generated meme', 'langchain error']}Meme created! URL: https://i.imgflip.com/9jt1zu.jpg + +Observation: https://i.imgflip.com/9jt1zu.jpg +Thought:I've got the image URL! + +Action: Download Image +Action Input: {'url': 'https://i.imgflip.com/9jt1zu.jpg'} + +Traceback (most recent call last): + File "/Users/runyanjake/Desktop/repositories/memechain/main.py", line 54, in + ... + File "/Users/runyanjake/Desktop/repositories/memechain/env/lib/python3.13/site-packages/requests/sessions.py", line 792, in get_adapter + raise InvalidSchema(f"No connection adapters were found for {url!r}") +requests.exceptions.InvalidSchema: No connection adapters were found for "{'url': 'https://i.imgflip.com/9jt1zu.jpg'}" +``` diff --git a/main.py b/main.py index faf59d7..821223f 100644 --- a/main.py +++ b/main.py @@ -11,30 +11,35 @@ from tools.caption_image import caption_image from tools.download_image import download_image system_prompt = """ - You are an assistant that looks up the numerical template_id of a meme from imgflip. - The following tools are available to you: +You are an assistant that helps users create memes using the Imgflip API. - 1. get_memes - Does not take any agruments. Returns a list of template_ids (integer) and names (string) which are the titles of the memes that correspond to the template_id. - 2. caption_image - Given a valid template_id, top text, and bottom text, generates an image with the desired text. Returns the url of the new meme as a string. - 3. download_image - Given a valid url returned from the caption_image tool, downloads the image we made locally. +Your tasks include: +1. Searching for the numerical template_id of a requested meme using the "Get Memes" tool. + - This tool should only be used once per request. + - If the template_id cannot be found, inform the user. - Use these tools if necessary to answer questions. +2. 
Generating a meme using the "Caption Image" tool once the template_id is found. + - The tool input must be valid JSON with the keys: "template_id" (integer) and "text" (list of strings). Keys must be enclosed in double quotes. + +3. Downloading the generated meme using the "Download Image" tool if requested. + +Your tool invocations must match the exact string of one of the tools listed above. """ prompt_template = f""" - {system_prompt} +{system_prompt} - Question: {{question}} +Question: {{question}} - Answer: Let's think step by step. +Answer: Let's think step by step. We should generate exactly one meme given the directions of the user. I will not ask the user for additional input after their request. Once the meme is created, I will conclude our conversation. """ prompt = ChatPromptTemplate.from_template(prompt_template) tools = [ Tool(name="Get Memes", func=get_memes, description="Does not take any agruments. Returns a list of template_ids (integer) and names (string) which are the titles of the memes that correspond to the template_id."), - Tool(name="Caption Image", func=caption_image, description="Given a valid template_id, top text, and bottom text, generates an image with the desired text. Returns the url of the new meme as a string."), - Tool(name="Download Image", func=download_image, description="Given a valid url returned from the caption_image tool, downloads the image we made locally.") + Tool(name="Caption Image", func=caption_image, description="Given a template_id and list of text strings, returns the url of a new meme as a string. Tool input is valid json syntax, with the following keys: 'template_id' (integer) and 'text' (list of strings)."), + Tool(name="Download Image", func=download_image, description="Given a valid url returned from the caption_image tool, downloads the image we made locally. 
Tool input is valid json syntax, with the following key: 'url' (string).") ] llm = OllamaLLM(model="llama3") @@ -46,6 +51,6 @@ agent_executor = initialize_agent( verbose=True ) -response = agent_executor.invoke({"input": "Generate an image for the 'stick poke' meme with the top text 'come on' and the bottom text 'do something'."}) +response = agent_executor.invoke({"input": "Generate an image for the 'two buttons' meme with first text 'generated meme' and second text 'langchain error'."}) print(response) diff --git a/tools/caption_image.py b/tools/caption_image.py index 149ff29..373be40 100644 --- a/tools/caption_image.py +++ b/tools/caption_image.py @@ -4,22 +4,30 @@ import requests CAPTION_IMAGE_URL = "https://api.imgflip.com/caption_image" def load_config(): - with open('config.json') as config_file: + with open('tools/config.json') as config_file: return json.load(config_file) -def caption_image(template_id, text0, text1): +def caption_image(input_data): + # Replace single quotes with double quotes because langchain likes to use single quotes + input_data = input_data.replace("'", '"') + + data = json.loads(input_data) + template_id = data['template_id'] + text = data['text'] + config = load_config() username = config['username'] password = config['password'] - url = "https://api.imgflip.com/caption_image" + url = CAPTION_IMAGE_URL payload = { "template_id": template_id, "username": username, "password": password, - "text0": text0, - "text1": text1 } + + for i in range(len(text)): + payload[f'text{i}'] = text[i] response = requests.post(url, data=payload) result = response.json()