Case 1. 과도한 확대
- 음식을 담은 그릇이 잘려있는 빙수(https://acceptmenu.cdn.baemin.com/acceptmenu-wmq4/i/2024/4/2/01htd8bmxrtbv1heccg2b23h2y.jpeg)
- 음식이 이미지 상단 프레임에 걸쳐서 잘려있는 돈가츠(https://acceptmenu.cdn.baemin.com/acceptmenu-wmq4/i/2024/4/2/01htd6p190ptx6jtj7nn96q2da.jpeg)
- 과도하게 확대되어 음식의 영역이 잘려있는 닭갈비(https://acceptmenu.cdn.baemin.com/acceptmenu-wmq4/i/2024/4/2/01htd6350w84ntrndyecs2jyf2.jpeg)
- 음식의 우측 영역이 잘려있는 볶음밥(https://acceptmenu.cdn.baemin.com/acceptmenu-wmq4/i/2024/4/2/01htd4jgg0zhh0k2tm6c1zwg7c.jpeg)
Case 2. 저화질
- 이미지의 전체적인 화질이 낮은 총각김치(https://acceptmenu.cdn.baemin.com/acceptmenu-wmq4/i/2024/4/2/01htdxzads84yc1a792w13bk0n.jpeg)
- 이미지의 전체적인 화질이 낮은 등갈비찜(https://acceptmenu.cdn.baemin.com/acceptmenu-wmq4/i/2024/4/2/01htdr2hmkk5dfy6t1qtkejq68.jpeg)
- 이미지의 전체적인 화질이 낮은 녹차(https://acceptmenu.cdn.baemin.com/acceptmenu-wmq4/i/2024/4/2/01htddq36cw99h706n7y6fd016.jpeg)
- 이미지의 전체적인 화질이 낮은 휘낭시에(https://acceptmenu.cdn.baemin.com/acceptmenu-wmq4/i/2024/4/2/01htdagfxr8k0kc1zbb62sx12j.jpeg)
{ "apim-request-id": "3b01cb13-7576-4004-8185-8d416fadf3ed", "content-length": "288", "content-type": "application/json; charset=utf-8", "modelVersion": "2023-10-01", "metadata": { "width": 1280, "height": 960 }, "objectsResult": { "values": [ { "boundingBox": { "x": 832, "y": 15, "w": 408, "h": 334 }, "tags": [ { "name": "Vegetable", "confidence": 0.51 } ] }, { "boundingBox": { "x": 144, "y": 75, "w": 1000, "h": 880 }, "tags": [ { "name": "Food", "confidence": 0.709 } ] } ] } }
{ "apim-request-id": "73f48582-75f6-48ab-8b5c-a14a8ceaa476", "content-length": "291", "content-type": "application/json; charset=utf-8", "modelVersion": "2023-10-01", "metadata": { "width": 3464, "height": 3464 }, "objectsResult": { "values": [ { "boundingBox": { "x": 710, "y": 133, "w": 2560, "h": 3237 }, "tags": [ { "name": "Food", "confidence": 0.795 } ] }, { "boundingBox": { "x": 609, "y": 1865, "w": 1570, "h": 1473 }, "tags": [ { "name": "Food", "confidence": 0.715 } ] } ] } }
해당 이미지는 음식이 프레임을 너무 꽉 채우고 있어서, 이미지 주변에 여백을 추가한 후 재시도함.
{ "apim-request-id": "fa47f938-70f8-453a-9f2a-75c33bb924a2", "content-length": "193", "content-type": "application/json; charset=utf-8", "modelVersion": "2023-10-01", "metadata": { "width": 1262, "height": 1004 }, "objectsResult": { "values": [ { "boundingBox": { "x": 100, "y": 102, "w": 1052, "h": 767 }, "tags": [ { "name": "Food", "confidence": 0.779 } ] } ] } }
{ "apim-request-id": "2669dbd3-9f7b-469b-9605-43cdb87915e4", "content-length": "193", "content-type": "application/json; charset=utf-8", "modelVersion": "2023-10-01", "metadata": { "width": 1280, "height": 2276 }, "objectsResult": { "values": [ { "boundingBox": { "x": 78, "y": 584, "w": 1195, "h": 1235 }, "tags": [ { "name": "Food", "confidence": 0.745 } ] } ] } }
테스트 결과
import json
from typing import List, Tuple

from promptflow import tool
from promptflow.client import PFClient
from azure.ai.vision.imageanalysis import ImageAnalysisClient
from azure.ai.vision.imageanalysis.models import VisualFeatures
from azure.core.credentials import AzureKeyCredential

# Fraction of the image width/height, measured from each edge, inside which a
# food bounding box is considered "too close to the frame".
EDGE_MARGIN_RATIO = 0.05


# The inputs section will change based on the arguments of the tool function, after you save the code
# Adding type to arguments and return value will help the system show the types properly
# Please update the function name/signature per need
@tool
def image_processing(image_url: str) -> dict:
    """Detect food objects in an image and judge whether the framing is acceptable.

    Runs object detection on ``image_url`` and inspects every detected object
    whose first tag is ``"Food"``. The image is rejected when no food object
    is found, or when any food bounding box lies within ``EDGE_MARGIN_RATIO``
    of an image edge.

    Args:
        image_url: URL of the image to analyze.

    Returns:
        A dict with the keys ``image_url``, ``image_height``, ``image_width``,
        ``is_good_food_image`` (bool), ``food_total`` (int), ``food_objects``
        (the raw detected-object dicts tagged as food) and ``message`` (a
        Korean, user-facing explanation of the verdict).

    Raises:
        ValueError: If the "object-detection" connection does not expose
            ``endpoint`` and ``key``.
    """
    pf_client = PFClient()
    pf_connection = pf_client.connections.get(name="object-detection")
    try:
        endpoint = pf_connection.endpoint
        key = pf_connection.key
    except (AttributeError, KeyError) as err:
        # Raise instead of calling exit(): terminating the interpreter from
        # inside a tool would take the whole host process down with it.
        raise ValueError(
            "Connection 'object-detection' must define 'endpoint' and 'key'."
        ) from err

    # Create an Image Analysis client for synchronous operations
    client = ImageAnalysisClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(key),
    )

    # Run object detection on the image. This is a synchronous (blocking) call.
    result = client.analyze_from_url(
        image_url=image_url,
        visual_features=[VisualFeatures.OBJECTS],
    ).as_dict()
    print(result)

    image_height = result["metadata"]["height"]
    image_width = result["metadata"]["width"]

    response_data = {
        "image_url": image_url,
        "image_height": image_height,
        "image_width": image_width,
        "is_good_food_image": True,
        "food_total": 0,
        "food_objects": [],
        "message": "",
    }

    # The "objectsResult" key may be missing or None; treat both as "nothing
    # detected" rather than failing with a KeyError.
    detected_objects = (result.get("objectsResult") or {}).get("values") or []

    edge_positions = []
    for detected in detected_objects:
        is_food, positions = position_image_food_overlap(
            detected, image_width, image_height, EDGE_MARGIN_RATIO
        )
        if is_food:
            response_data["food_objects"].append(detected)
        edge_positions.extend(positions)

    food_total = len(response_data["food_objects"])
    response_data["food_total"] = food_total

    if food_total == 0:
        response_data["message"] = "음식 이미지가 존재하지 않거나 너무 확대되어진 이미지입니다."
        response_data["is_good_food_image"] = False
        return response_data

    # De-duplicate while keeping first-seen order so the message text is
    # stable between runs (a plain set() has no guaranteed iteration order).
    unique_positions = list(dict.fromkeys(edge_positions))
    if unique_positions:
        detected_position_str = ", ".join(unique_positions)
        response_data["is_good_food_image"] = False
        response_data["message"] = f"음식 이미지가 {detected_position_str} 프레임에 너무 가깝게 위치하고 있습니다."
    else:
        response_data["message"] = "음식 이미지가 정상적으로 존재합니다."
    return response_data


def position_image_food_overlap(
    object: dict, image_width: int, image_height: int, ratio: float
) -> Tuple[bool, List[str]]:
    """Report whether a detected object is food and which image edges it crowds.

    Only the first tag of the object is inspected; the object counts as food
    when that tag's name is exactly ``"Food"``.

    Args:
        object: One detected-object dict with a ``"boundingBox"`` mapping
            (``x``, ``y``, ``w``, ``h``) and a ``"tags"`` list of
            ``{"name": ..., "confidence": ...}`` entries.
        image_width: Width of the full image, in the bounding box's units.
        image_height: Height of the full image, in the bounding box's units.
        ratio: Margin, as a fraction of the image size, measured inward from
            each edge (e.g. ``0.05`` for 5 %).

    Returns:
        ``(is_food, positions)``. ``positions`` lists the edges whose margin
        the bounding box enters, in the order "좌측" (left), "우측" (right),
        "상단" (top), "하단" (bottom). It is always empty for non-food objects.
    """
    tags = object.get("tags") or []
    # An object without tags cannot be classified as food.
    if not tags or tags[0].get("name") != "Food":
        return False, []

    box = object["boundingBox"]
    left = box["x"]
    top = box["y"]
    right = left + box["w"]
    bottom = top + box["h"]

    positions = []
    # Left edge
    if left < image_width * ratio:
        positions.append("좌측")
    # Right edge
    if right > image_width * (1 - ratio):
        positions.append("우측")
    # Top edge
    if top < image_height * ratio:
        positions.append("상단")
    # Bottom edge
    if bottom > image_height * (1 - ratio):
        positions.append("하단")
    return True, positions
import json
import os

from promptflow import tool
from promptflow.client import PFClient
from azure.core.credentials import AzureKeyCredential
from openai import AzureOpenAI


# The inputs section will change based on the arguments of the tool function, after you save the code
# Adding type to arguments and return value will help the system show the types properly
# Please update the function name/signature per need
@tool
def gpt_4_vision_analysis(input_url: str, question: str) -> str:
    """Ask the GPT-4 Vision deployment to describe a food image.

    Args:
        input_url: URL of the image sent to the model.
        question: Currently not forwarded to the model; the user prompt is
            the fixed Korean text below.
            NOTE(review): confirm whether this input should replace or
            extend the fixed prompt.

    Returns:
        The text content of the model's first completion choice.

    Raises:
        ValueError: If no endpoint or API key can be resolved.
    """
    pf_client = PFClient()
    # NOTE(review): `with_secrets=True` is assumed to make the local client
    # return the real secret instead of a scrubbed placeholder - confirm
    # against the installed promptflow version.
    pf_connection = pf_client.connections.get(name="aoai_australia", with_secrets=True)

    api_base = getattr(pf_connection, "api_base", None)
    # Never keep the key in source control: read it from the environment,
    # falling back to the secret stored on the connection. The key that was
    # previously hard-coded here should be treated as leaked and rotated.
    api_key = os.environ.get("AZURE_OPENAI_API_KEY") or getattr(
        pf_connection, "api_key", None
    )
    if not api_base or not api_key:
        # Raise instead of calling exit(): terminating the interpreter from
        # inside a tool would take the whole host process down with it.
        raise ValueError(
            "Connection 'aoai_australia' must define 'api_base', and an API key "
            "must be available via AZURE_OPENAI_API_KEY or the connection's 'api_key'."
        )

    deployment_name = 'gpt-4-vision'
    api_version = '2023-12-01-preview'  # this might change in the future

    client = AzureOpenAI(
        api_key=api_key,
        api_version=api_version,
        # Strip a trailing slash so the joined URL never contains "//".
        base_url=f"{api_base.rstrip('/')}/openai/deployments/{deployment_name}",
    )

    response = client.chat.completions.create(
        model=deployment_name,
        messages=[
            {
                "role": "system",
                "content": "As an AI assistant, your task involves interpreting about the food image. Remember to provide accurate answers based on the information present in the image.",
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "이 이미지애 대해서 설명해 주세요.:",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": input_url},
                    },
                ],
            },
        ],
        temperature=0.1,
        top_p=0.95,
        max_tokens=2000,
    )
    return response.choices[0].message.content
# system:
As an AI assistant, your task involves interpreting images and responding to questions about the image. Remember to provide accurate answers based on the information present in the image.

# user:
Can you tell me what the image depicts?
![image]({{image_input}})