Formatting Function Examples

DPO Formatting

from typing import List, Dict, Any, Union, Optional


def format_record(
    fields: Dict[str, str],
    responses: List[Dict],
    unified_responses: Optional[Dict[str, Any]],
) -> Union[Dict[str, str], List[Dict[str, str]], None]:
    """
    Formats the given record data for Direct Preference Optimization (DPO),
        focusing on selecting responses based on preference rankings.

    The first entry of the ranking is treated as the "chosen" response and the
    second entry as the "rejected" response; any further entries are ignored.

    :param fields: A dictionary mapping field names to their values for a specific record.
        Example: {
            "prompt": "What is gravity?",
            "response_1": "Gravity is a force attracting two bodies towards each other.",
            "response_2": "I have no idea about gravity."
        }

    :param responses: A list of dictionaries, each mapping a question name to its corresponding response.
        Currently not utilized in this function,
        but included for future enhancement possibilities and structural consistency.
        Example: [
            {"ranking_question": ["response_1", "response_2"]},
            ...
        ]

    :param unified_responses: An optional dictionary for non-TEXT question types (LABEL, MULTI-LABEL, RANKING, RATING),
        mapping question names to their unified responses. May be None or empty,
        in which case the record is skipped.
        Example: {
            "ranking_question": ["response_1", "response_2"]
        }

    :return: A dictionary with the keys "prompt", "chosen" and "rejected" when all
        required data is available. Returns None when `unified_responses` is None or
        empty, when the ranking has fewer than two entries, or when the prompt or
        either ranked response field is missing or empty in `fields`.
        Example return value: {
            "prompt": "What is gravity?",
            "chosen": "Gravity is a force attracting two bodies towards each other.",
            "rejected": "I have no idea about gravity."
        }
    """

    # Customize these variables based on your dataset schema.
    _prompt_field_name: str = "prompt"
    _ranking_question_name: str = "ranking_question"

    # `unified_responses` is optional; without it there is no ranking to use,
    # so skip the record instead of failing on `None.get(...)`.
    if not unified_responses:
        return None

    # Verify the availability of ranking data (at least a best and a second-best entry).
    ranking_response: Optional[List[str]] = unified_responses.get(
        _ranking_question_name
    )
    if not ranking_response or len(ranking_response) < 2:
        return None

    # The ranking holds *field names*, ordered from most to least preferred.
    chosen_response_field_name, rejected_response_field_name = ranking_response[:2]

    # Retrieve the data for the prompt, chosen, and rejected responses.
    prompt: Optional[str] = fields.get(_prompt_field_name)
    chosen: Optional[str] = fields.get(chosen_response_field_name)
    rejected: Optional[str] = fields.get(rejected_response_field_name)

    # Ensure all required data is present (and non-empty) before returning.
    if not (prompt and chosen and rejected):
        return None

    return {"prompt": prompt, "chosen": chosen, "rejected": rejected}

Question Answering Formatting

from typing import List, Dict, Any, Union, Optional


def format_record(
    fields: Dict[str, str],
    responses: List[Dict[str, Any]],
    unified_responses: Optional[Dict[str, Any]],
) -> Union[List[Dict[str, str]], None]:
    """
    Builds Question Answering (QA) training rows from a single record: one row per
    non-empty answer, each pairing that answer with the record's context and question.

    :param fields: The record's fields; the "context" and "question" entries are read.
        Example: {
            "context": "Gravity is a natural phenomenon by which all things with mass or energy...",
            "question": "What is gravity?"
        }

    :param responses: A list of response dictionaries; the "answer" entry of each one is read.
        Responses whose "answer" is missing or empty are skipped.
        Example: [
            {"answer": "a natural phenomenon by which all things with mass or energy are brought..."},
            {"answer": "the force that attracts a body toward the center of the earth"},
            ...
        ]

    :param unified_responses: An optional dictionary of unified responses for non-TEXT question types.
        Not read by this function; accepted only so the signature matches other formatting functions.

    :return: A list of dictionaries with the keys 'context', 'question' and 'answer', one per usable
        response. Returns None when the context or question is missing/empty, or when no response
        carries a non-empty answer.
        Example return value: [
            {
                "context": "Gravity is a natural phenomenon by which all things with mass or energy...",
                "question": "What is gravity?",
                "answer": "a natural phenomenon by which all things with mass or energy are brought..."
            },
            ...
        ]
    """

    # Schema names; adjust to match the dataset.
    context_key: str = "context"
    question_key: str = "question"
    answer_key: str = "answer"

    question_text: Optional[str] = fields.get(question_key)
    context_text: Optional[str] = fields.get(context_key)

    # Guard clause: a QA row is meaningless without both the question and its context.
    if not (question_text and context_text):
        return None

    # Pull the answer out of every response, then keep only the non-empty ones.
    candidate_answers = (entry.get(answer_key) for entry in responses)
    qa_rows = [
        {"context": context_text, "question": question_text, "answer": answer}
        for answer in candidate_answers
        if answer
    ]

    # An empty result is reported as None rather than as an empty list.
    return qa_rows or None