Hook up caption_image tool

2026-02-03 13:37:30 -08:00 · 2025-02-11 00:46:01 -08:00 · 2025-02-11 00:46:01 -08:00 · 6b3cb37fad
commit 6b3cb37fad
parent 8ad6808ee5
3 changed files with 70 additions and 18 deletions
--- a/README.md
+++ b/README.md
@@ -32,9 +32,48 @@ Pull model
 ```
 ollama pull llama3
 ```
-If failing, check status of ollama process:
+If failing, check status of ollama process (Ubuntu):
 ```
 sudo service ollama status
 ```
 Or start the program from search (macOS).
 ## Results
 ### So This Actually Works!
 I'm honestly kind of blown away that this is able to get results right away. The agent uses the get_memes tool reliably, but sometimes calls it more than once despite being prompted to use it only once.  
 The problems lie with the caption_image tool: the agent sometimes calls it correctly and sometimes does not, which kicks the chat back to the user and breaks the flow.  
 However, when the agent is able to call caption_image correctly, we get the results we're after:
 ```
 env➜  memechain git:(main) ✗ python main.py
 > Entering new AgentExecutor chain...
 Let's get started.
 Thought: To generate an image for the "two buttons" meme, I need to find a template_id that corresponds to this meme. Then, I can use the Caption Image tool to create a new meme with the desired text.
 Action: Get Memes
 Action Input: None (no input needed)
 Observation: ID: 181913649, Name: Drake Hotline Bling
 ID: 87743020, Name: Two Buttons
    ...
 ID: 398221598, Name: Goose Chase
 Thought:Thought: Now that I have the list of template_ids and names from Get Memes, I can find the ID for the "two buttons" meme. The name "Two Buttons" matches with the template_id 87743020.
 Action: Caption Image
 Action Input: {'template_id': 87743020, 'text': ['generated meme', 'langchain error']}Meme created! URL: https://i.imgflip.com/9jt1zu.jpg
 Observation: https://i.imgflip.com/9jt1zu.jpg
 Thought:I've got the image URL!
 Action: Download Image
 Action Input: {'url': 'https://i.imgflip.com/9jt1zu.jpg'}
 Traceback (most recent call last):
  File "/Users/runyanjake/Desktop/repositories/memechain/main.py", line 54, in <module>
    ...
  File "/Users/runyanjake/Desktop/repositories/memechain/env/lib/python3.13/site-packages/requests/sessions.py", line 792, in get_adapter
    raise InvalidSchema(f"No connection adapters were found for {url!r}")
 requests.exceptions.InvalidSchema: No connection adapters were found for "{'url': 'https://i.imgflip.com/9jt1zu.jpg'}"
 ```
--- a/main.py
+++ b/main.py
@@ -11,30 +11,35 @@ from tools.caption_image import caption_image
 from tools.download_image import download_image
 system_prompt = """
-    You are an assistant that looks up the numerical template_id of a meme from imgflip.
+You are an assistant that helps users create memes using the Imgflip API. 
    The following tools are available to you:
-    1. get_memes - Does not take any agruments. Returns a list of template_ids (integer) and names (string) which are the titles of the memes that correspond to the template_id.
+Your tasks include:
-    2. caption_image - Given a valid template_id, top text, and bottom text, generates an image with the desired text. Returns the url of the new meme as a string.
+1. Searching for the numerical template_id of a requested meme using the "Get Memes" tool.
-    3. download_image - Given a valid url returned from the caption_image tool, downloads the image we made locally.
+   - This tool should only be used once per request.
   - If the template_id cannot be found, inform the user.
-    Use these tools if necessary to answer questions.
+2. Generating a meme using the "Caption Image" tool once the template_id is found.
   - The tool input must be valid JSON with the keys: "template_id" (integer) and "text" (list of strings). Keys must be enclosed in double quotes.
 3. Downloading the generated meme using the "Download Image" tool if requested.
 Your tool invocations must match the exact string of one of the tools listed above.
 """
 prompt_template = f"""
-    {system_prompt}
+{system_prompt}
-    Question: {{question}}
+Question: {{question}}
-    Answer: Let's think step by step.
+Answer: Let's think step by step. We should generate exactly one meme given the directions of the user. I will not ask the user for additional input after their request. Once the meme is created, I will conclude our conversation.
 """
 prompt = ChatPromptTemplate.from_template(prompt_template)
 tools = [
    Tool(name="Get Memes", func=get_memes, description="Does not take any agruments. Returns a list of template_ids (integer) and names (string) which are the titles of the memes that correspond to the template_id."),
-    Tool(name="Caption Image", func=caption_image, description="Given a valid template_id, top text, and bottom text, generates an image with the desired text. Returns the url of the new meme as a string."),
+    Tool(name="Caption Image", func=caption_image, description="Given a template_id and list of text strings, returns the url of a new meme as a string. Tool input is valid json syntax, with the following keys: 'template_id' (integer) and 'text' (list of strings)."),
-    Tool(name="Download Image", func=download_image, description="Given a valid url returned from the caption_image tool, downloads the image we made locally.")
+    Tool(name="Download Image", func=download_image, description="Given a valid url returned from the caption_image tool, downloads the image we made locally. Tool input is valid json syntax, with the following key: 'url' (string).")
 ]
 llm = OllamaLLM(model="llama3")
@@ -46,6 +51,6 @@ agent_executor = initialize_agent(
    verbose=True
 )
-response = agent_executor.invoke({"input": "Generate an image for the 'stick poke' meme with the top text 'come on' and the bottom text 'do something'."})
+response = agent_executor.invoke({"input": "Generate an image for the 'two buttons' meme with first text 'generated meme' and second text 'langchain error'."})
 print(response)
--- a/tools/caption_image.py
+++ b/tools/caption_image.py
@@ -4,22 +4,30 @@ import requests
 CAPTION_IMAGE_URL = "https://api.imgflip.com/caption_image"
 def load_config():
-    with open('config.json') as config_file:
+    with open('tools/config.json') as config_file:
        return json.load(config_file)   
-def caption_image(template_id, text0, text1):
+def caption_image(input_data):
    # Replace single quotes with double quotes because langchain likes to use single quotes
    input_data = input_data.replace("'", '"')
    data = json.loads(input_data)
    template_id = data['template_id']
    text = data['text']
    config = load_config()
    username = config['username']
    password = config['password']
-    url = "https://api.imgflip.com/caption_image"
+    url = CAPTION_IMAGE_URL
    payload = {
        "template_id": template_id,
        "username": username,
        "password": password,
        "text0": text0,
        "text1": text1
    }
    for i in range(len(text)):
        payload[f'text{i}'] = text[i]
    response = requests.post(url, data=payload)
    result = response.json()