Hook up caption_image tool

2025-12-15 23:47:29 -08:00 · 2025-02-11 00:46:01 -08:00 · 2025-02-11 00:46:01 -08:00 · 6b3cb37fad
commit 6b3cb37fad
parent 8ad6808ee5
3 changed files with 70 additions and 18 deletions
--- a/README.md
+++ b/README.md
@@ -32,9 +32,48 @@ Pull model
 ```
 ollama pull llama3
 ```
-If failing, check status of ollama process:
+If failing, check status of ollama process (Ubuntu):
 ```
 sudo service ollama status
 ```
+Or start the Ollama app from search (macOS).

+## Results

+### So This Actually Works!
+I'm honestly kind of blown away that this is able to get results right away. The agent uses the get_memes tool reliably, but sometimes calls it more than once despite being prompted to use it only once.  
+The main problem is the caption_image tool: the agent sometimes calls it correctly and sometimes does not, in which case it kicks the chat back to the user, which breaks the flow.  
+However, when the agent is able to call caption_image correctly, we get the results we're after:
+
+```
+env➜  memechain git:(main) ✗ python main.py
+
+> Entering new AgentExecutor chain...
+Let's get started.
+
+Thought: To generate an image for the "two buttons" meme, I need to find a template_id that corresponds to this meme. Then, I can use the Caption Image tool to create a new meme with the desired text.
+
+Action: Get Memes
+Action Input: None (no input needed)
+Observation: ID: 181913649, Name: Drake Hotline Bling
+ID: 87743020, Name: Two Buttons
+    ...
+ID: 398221598, Name: Goose Chase
+Thought:Thought: Now that I have the list of template_ids and names from Get Memes, I can find the ID for the "two buttons" meme. The name "Two Buttons" matches with the template_id 87743020.
+
+Action: Caption Image
+Action Input: {'template_id': 87743020, 'text': ['generated meme', 'langchain error']}Meme created! URL: https://i.imgflip.com/9jt1zu.jpg
+
+Observation: https://i.imgflip.com/9jt1zu.jpg
+Thought:I've got the image URL!
+
+Action: Download Image
+Action Input: {'url': 'https://i.imgflip.com/9jt1zu.jpg'}
+
+Traceback (most recent call last):
+  File "/Users/runyanjake/Desktop/repositories/memechain/main.py", line 54, in <module>
+    ...
+  File "/Users/runyanjake/Desktop/repositories/memechain/env/lib/python3.13/site-packages/requests/sessions.py", line 792, in get_adapter
+    raise InvalidSchema(f"No connection adapters were found for {url!r}")
+requests.exceptions.InvalidSchema: No connection adapters were found for "{'url': 'https://i.imgflip.com/9jt1zu.jpg'}"
+```
--- a/main.py
+++ b/main.py
@@ -11,30 +11,35 @@ from tools.caption_image import caption_image
 from tools.download_image import download_image

 system_prompt = """
-    You are an assistant that looks up the numerical template_id of a meme from imgflip.
-    The following tools are available to you:
+You are an assistant that helps users create memes using the Imgflip API. 

-    1. get_memes - Does not take any agruments. Returns a list of template_ids (integer) and names (string) which are the titles of the memes that correspond to the template_id.
-    2. caption_image - Given a valid template_id, top text, and bottom text, generates an image with the desired text. Returns the url of the new meme as a string.
-    3. download_image - Given a valid url returned from the caption_image tool, downloads the image we made locally.
+Your tasks include:
+1. Searching for the numerical template_id of a requested meme using the "Get Memes" tool.
+   - This tool should only be used once per request.
+   - If the template_id cannot be found, inform the user.

-    Use these tools if necessary to answer questions.
+2. Generating a meme using the "Caption Image" tool once the template_id is found.
+   - The tool input must be valid JSON with the keys: "template_id" (integer) and "text" (list of strings). Keys must be enclosed in double quotes.
+
+3. Downloading the generated meme using the "Download Image" tool if requested.
+
+Your tool invocations must match the exact string of one of the tools listed above.
 """

 prompt_template = f"""
-    {system_prompt}
+{system_prompt}

-    Question: {{question}}
+Question: {{question}}

-    Answer: Let's think step by step.
+Answer: Let's think step by step. We should generate exactly one meme given the directions of the user. I will not ask the user for additional input after their request. Once the meme is created, I will conclude our conversation.
 """

 prompt = ChatPromptTemplate.from_template(prompt_template)

 tools = [
    Tool(name="Get Memes", func=get_memes, description="Does not take any agruments. Returns a list of template_ids (integer) and names (string) which are the titles of the memes that correspond to the template_id."),
-    Tool(name="Caption Image", func=caption_image, description="Given a valid template_id, top text, and bottom text, generates an image with the desired text. Returns the url of the new meme as a string."),
-    Tool(name="Download Image", func=download_image, description="Given a valid url returned from the caption_image tool, downloads the image we made locally.")
+    Tool(name="Caption Image", func=caption_image, description="Given a template_id and list of text strings, returns the url of a new meme as a string. Tool input is valid json syntax, with the following keys: 'template_id' (integer) and 'text' (list of strings)."),
+    Tool(name="Download Image", func=download_image, description="Given a valid url returned from the caption_image tool, downloads the image we made locally. Tool input is valid json syntax, with the following key: 'url' (string).")
 ]

 llm = OllamaLLM(model="llama3")
@@ -46,6 +51,6 @@ agent_executor = initialize_agent(
    verbose=True
 )

-response = agent_executor.invoke({"input": "Generate an image for the 'stick poke' meme with the top text 'come on' and the bottom text 'do something'."})
+response = agent_executor.invoke({"input": "Generate an image for the 'two buttons' meme with first text 'generated meme' and second text 'langchain error'."})
 print(response)

--- a/tools/caption_image.py
+++ b/tools/caption_image.py
@@ -4,23 +4,31 @@ import requests
 CAPTION_IMAGE_URL = "https://api.imgflip.com/caption_image"

 def load_config():
-    with open('config.json') as config_file:
+    with open('tools/config.json') as config_file:
        return json.load(config_file)   

-def caption_image(template_id, text0, text1):
+def caption_image(input_data):
+    # Replace single quotes with double quotes because langchain likes to use single quotes
+    input_data = input_data.replace("'", '"')
+    
+    data = json.loads(input_data)
+    template_id = data['template_id']
+    text = data['text']
+
    config = load_config()
    username = config['username']
    password = config['password']

-    url = "https://api.imgflip.com/caption_image"
+    url = CAPTION_IMAGE_URL
    payload = {
        "template_id": template_id,
        "username": username,
        "password": password,
-        "text0": text0,
-        "text1": text1
    }
    
+    for i in range(len(text)):
+        payload[f'text{i}'] = text[i]
+
    response = requests.post(url, data=payload)
    result = response.json()