<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Infodemiology</journal-id><journal-id journal-id-type="publisher-id">infodemiology</journal-id><journal-id journal-id-type="index">38</journal-id><journal-title>JMIR Infodemiology</journal-title><abbrev-journal-title>JMIR Infodemiology</abbrev-journal-title><issn pub-type="epub">2564-1891</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v6i1e90295</article-id><article-id pub-id-type="doi">10.2196/90295</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Japanese-Language AI Agent System for Human Papillomavirus Vaccine Infoveillance and Public Communication: Development and Feasibility Evaluation</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Liu</surname><given-names>Junyu</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yang</surname><given-names>Siwen</given-names></name><degrees>BEng</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ma</surname><given-names>Dexiu</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Niu</surname><given-names>Qian</given-names></name><degrees>PhD</degrees><xref 
ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Zequn</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Nagai-Tanima</surname><given-names>Momoko</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Aoyama</surname><given-names>Tomoki</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Graduate School of Medicine, Kyoto University</institution><addr-line>Yoshida-honmachi, Sakyo-ku</addr-line><addr-line>Kyoto</addr-line><country>Japan</country></aff><aff id="aff2"><institution>David R. Cheriton School of Computer Science, University of Waterloo</institution><addr-line>Waterloo</addr-line><addr-line>ON</addr-line><country>Canada</country></aff><aff id="aff3"><institution>Department of Computer Science, Whitacre College of Engineering, Texas Tech University</institution><addr-line>Lubbock</addr-line><addr-line>TX</addr-line><country>United States</country></aff><aff id="aff4"><institution>Graduate School of Engineering, The University of Tokyo</institution><addr-line>Bunkyo-ku</addr-line><addr-line>Tokyo</addr-line><country>Japan</country></aff><aff id="aff5"><institution>Department of EEIS, University of Science and Technology of China</institution><addr-line>Hefei</addr-line><addr-line>Anhui</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Mackey</surname><given-names>Tim</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Kim</surname><given-names>Kwanho</given-names></name></contrib><contrib 
contrib-type="reviewer"><name name-style="western"><surname>Quadri</surname><given-names>Syed F</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Lo</surname><given-names>Wen-Juo</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Tomoki Aoyama, MD, PhD, Graduate School of Medicine, Kyoto University, Yoshida-honmachi, Sakyo-ku, Kyoto, 606-8501, Japan, 81 075-753-7531; <email>aoyama.tomoki.4e@kyoto-u.ac.jp</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>21</day><month>5</month><year>2026</year></pub-date><volume>6</volume><elocation-id>e90295</elocation-id><history><date date-type="received"><day>25</day><month>12</month><year>2025</year></date><date date-type="rev-recd"><day>21</day><month>03</month><year>2026</year></date><date date-type="accepted"><day>21</day><month>04</month><year>2026</year></date></history><copyright-statement>&#x00A9; Junyu Liu, Siwen Yang, Dexiu Ma, Qian Niu, Zequn Zhang, Momoko Nagai-Tanima, Tomoki Aoyama. Originally published in JMIR Infodemiology (<ext-link ext-link-type="uri" xlink:href="https://infodemiology.jmir.org">https://infodemiology.jmir.org</ext-link>), 21.5.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Infodemiology, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://infodemiology.jmir.org/">https://infodemiology.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://infodemiology.jmir.org/2026/1/e90295"/><abstract><sec><title>Background</title><p>Human papillomavirus (HPV) vaccine hesitancy remains a significant public health challenge in Japan, where proactive vaccination recommendations were suspended between 2013 and 2021. The resulting information gap between medical institutions and vaccine-hesitant populations is exacerbated by misinformation on social media platforms. Traditional public health communication strategies cannot address individual queries while simultaneously monitoring population-level discourse.</p></sec><sec><title>Objective</title><p>This study aimed to develop and conduct a feasibility evaluation of a dual-purpose artificial intelligence agent system that delivers verified HPV vaccine information to the public through a conversational interface while generating infoveillance reports for medical institutions based on user interactions and social media discourse.</p></sec><sec sec-type="methods"><title>Methods</title><p>We implemented a system with 3 components: a vector database integrating 139,803 documents, including academic papers, Japanese government sources, news media, and social media posts; a retrieval-augmented generation chatbot using a ReAct agent architecture with iterative multitool orchestration across 5 specialized knowledge sources; and an automated report generation system with modules for news analysis, research synthesis, social media sentiment analysis, including stance classification and topic modeling, and user interaction pattern identification. 
System performance was assessed using both automated and manual evaluation protocols on a scale from 0 to 5.</p></sec><sec sec-type="results"><title>Results</title><p>The entire system functioned as expected. For single-turn evaluation, the chatbot achieved mean scores of 4.83 (SD 0.67; 95% CI 4.71-4.93) for relevance, 4.89 (SD 0.53; 95% CI 4.79-4.97) for routing, 4.50 (SD 1.29; 95% CI 4.27-4.70) for reference quality, 4.90 (SD 0.62; 95% CI 4.78-4.99) for correctness, and 4.88 (SD 0.54; 95% CI 4.78-4.96) for professional identity, with an overall mean of 4.80. Multiturn evaluation yielded higher mean scores: 4.94 for context memory (SD 0.25; 95% CI 4.84-5.00) and an overall mean of 4.98, with topic centering and identity achieving 5.00. The report generation system achieved high scores across all sections: 4.83 for completeness (SD 0.37; 95% CI 4.73-4.94), 4.88 for correctness (SD 0.33; 95% CI 4.77-4.96), and 4.12 for helpfulness (SD 0.48; 95% CI 3.98-4.27). Reference validity achieved perfect scores (5.00) across all periods, with citation correctness averaging 4.21 (SD 0.58; 95% CI 3.96-4.46).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This feasibility study demonstrated that an integrated artificial intelligence agent system can support both public HPV vaccine communication and social media infoveillance in a Japanese-language context. 
Prospective deployment with real users is needed to assess actual public health impact.</p></sec></abstract><kwd-group><kwd>human papillomavirus</kwd><kwd>HPV</kwd><kwd>artificial intelligence agent</kwd><kwd>AI agent</kwd><kwd>large language model</kwd><kwd>stance analysis</kwd><kwd>topic modeling</kwd><kwd>artificial intelligence</kwd><kwd>AI</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Human papillomavirus (HPV) is a significant public health concern that caused 662,044 new cervical cancer cases and 348,709 deaths worldwide in 2022 [<xref ref-type="bibr" rid="ref1">1</xref>]. HPV vaccines have demonstrated high efficacy in preventing HPV-related diseases [<xref ref-type="bibr" rid="ref2">2</xref>], and numerous countries have implemented national vaccination programs since their introduction in 2006. However, vaccine hesitancy remains a persistent challenge [<xref ref-type="bibr" rid="ref3">3</xref>], particularly in countries such as Japan, where HPV vaccination rates have dropped dramatically owing to safety concerns and media coverage of alleged adverse events [<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>The spread of misinformation regarding HPV vaccines through social media platforms has exacerbated public concern [<xref ref-type="bibr" rid="ref5">5</xref>], creating a complex information landscape in which accurate medical information competes with anecdotal reports and unverified claims. Traditional public health communication strategies face significant challenges in addressing health misinformation at scale because responding effectively requires simultaneously countering individual-level psychological barriers and monitoring population-level misinformation dynamics across diverse platforms [<xref ref-type="bibr" rid="ref6">6</xref>]. 
Medical institutions require timely insights into public discourse to develop effective communication strategies; however, manual analysis of vast amounts of social media data and public inquiries is resource-intensive and time-consuming.</p><p>Recent advances in large language model (LLM) and retrieval-augmented generation (RAG) systems offer promising solutions to bridge this information gap [<xref ref-type="bibr" rid="ref7">7</xref>]. LLMs demonstrate remarkable capabilities for natural language understanding and generation across multiple languages [<xref ref-type="bibr" rid="ref8">8</xref>], including Japanese, which presents unique challenges owing to its complex writing system and grammatical structure [<xref ref-type="bibr" rid="ref9">9</xref>]. RAG systems combine the generative capabilities of LLMs with retrieval from curated knowledge bases, enabling responses grounded in verified information sources while maintaining conversational fluency.</p><p>Previous studies have applied natural language processing to HPV-related social media analysis, primarily focusing on sentiment analysis and topic modeling [<xref ref-type="bibr" rid="ref10">10</xref>]. However, these approaches typically operate as passive analytical tools instead of as active information dissemination systems. Chatbot systems for health information have been developed for various domains [<xref ref-type="bibr" rid="ref11">11</xref>]; however, few integrate multisource retrieval from academic literature, official guidelines, news media, and social media discourse while simultaneously providing bidirectional communication between the public and health institutions.</p><p>The Japanese context poses unique challenges to and opportunities for such systems. 
Japan experienced a dramatic suspension of proactive HPV vaccination recommendations from 2013 to 2021 because of safety concerns, resulting in vaccination rates falling below 1% and creating a substantial gap in population immunity [<xref ref-type="bibr" rid="ref4">4</xref>]. The government&#x2019;s 2022 resumption of vaccination recommendations necessitates renewed public education efforts [<xref ref-type="bibr" rid="ref12">12</xref>]. Furthermore, Japanese-language health information systems face technical challenges, including multiscript processing (hiragana, katakana, and kanji), medical terminology localization, and culturally appropriate communication styles.</p><p>In this study, we developed and implemented a comprehensive artificial intelligence (AI) agent system designed to address both public information needs and institutional monitoring requirements for the HPV vaccine discourse in Japan. Our system has two main features: (1) a RAG-based chatbot that answers public queries by retrieving and synthesizing information from academic papers, official documents, news articles, and social media posts; and (2) an analytics dashboard that generates reports for medical institutions based on aggregated chat histories and social media data. The system uses multisource data collection, semantic search with vector embeddings, intelligent query routing, and automated evaluation frameworks.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>System Architecture</title><p>We developed a multicomponent AI agent system for HPV vaccine information dissemination and public opinion analysis. 
The system comprises 3 main modules: a multisource data collection and storage system, a ReAct agent&#x2013;based chatbot [<xref ref-type="bibr" rid="ref13">13</xref>] for public information queries, and a report generation system for medical institutions.</p><p>The overall architecture follows a distributed design pattern with a centralized vector database (Qdrant) [<xref ref-type="bibr" rid="ref14">14</xref>] serving as a knowledge repository (<xref ref-type="fig" rid="figure1">Figure 1</xref>). Data flow from multiple external sources through specialized collectors into the database, where they are indexed using semantic embeddings. The chatbot and report generation modules both query this database but serve different end users with distinct interfaces and functionalities. The system implements a bidirectional information flow. The chatbot provides HPV vaccine information to the public while simultaneously collecting user inquiries with consent, and the report generator aggregates these interactions with social media data to produce actionable insights for medical institutions.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Overall system architecture showing the integration of the data collection, vector database, chatbot interface, and report generation components. HPV: human papillomavirus.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="infodemiology_v6i1e90295_fig01.png"/></fig></sec><sec id="s2-2"><title>Data Collection and Database</title><p>We implemented a vector database infrastructure as the central knowledge repository managing 4 distinct collections: academic papers, official documents, social media posts, and chat histories. Each document was represented as a 2048D vector using embedding models optimized for Japanese-language processing (PLaMo-Embedding-1B; Preferred Networks, Inc) [<xref ref-type="bibr" rid="ref15">15</xref>]. 
This database uses cosine similarity metrics for semantic search operations [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>], supporting efficient retrieval with customizable parameters and metadata preservation.</p><p>Data were collected from 4 heterogeneous sources to construct a comprehensive knowledge base spanning scientific evidence, official guidance, media coverage, and public discourse. Academic papers were retrieved from PubMed [<xref ref-type="bibr" rid="ref18">18</xref>] through keyword-based searches with temporal filtering, capturing abstracts, MeSH (Medical Subject Headings) terms, journal information, and DOIs. Official documents and web content were collected from authoritative sources, including the World Health Organization [<xref ref-type="bibr" rid="ref19">19</xref>] and the Japanese Ministry of Health, Labor, and Welfare (MHLW) [<xref ref-type="bibr" rid="ref20">20</xref>] through multiple complementary methods: intelligent query analysis for information synthesis, filtered web searches targeting official sources, online PDF document discovery and extraction, and specialized scraping of government meeting records and reference materials. News articles were aggregated from multiple news sources using keyword-based searches in Japanese and English, and deduplication was used to ensure unique coverage. Social media data from X (formerly known as Twitter) were collected through daily automated harvesting using Tweepy [<xref ref-type="bibr" rid="ref21">21</xref>] with temporal specifications. 
Rate limit handling was implemented to ensure comprehensive data capture across extended periods.</p></sec><sec id="s2-3"><title>Chatbot Implementation</title><p>We implemented a ReAct agent&#x2013;based chatbot using LlamaIndex [<xref ref-type="bibr" rid="ref22">22</xref>] through an iterative multitool orchestration architecture in which a single intelligent controller dynamically selects and combines information from multiple specialized data sources across sequential decision-making iterations. The system addresses the challenge of answering diverse user queries by enabling flexible, multisource information gathering while maintaining conversational coherence and citation quality assurance.</p><sec id="s2-3-1"><title>Architecture</title><p>The chatbot uses a single controller agent with 5 specialized tools: papers (academic literature), the web (official documents and guidelines), social media (public discourse), news (media coverage), and chitchat (casual conversations). A citation validation tool ensures the response quality. Unlike conventional routing architectures, this design enables the controller to select and combine multiple tools iteratively for a single query, synthesizing information across heterogeneous sources.</p><p>Each tool performs a semantic similarity search against its respective vector database collection, retrieving the relevant documents that the controller assembles into responses using the proper source attribution. The controller analyzes queries in the conversation context, determines appropriate information sources, and iteratively gathers evidence until it is sufficient for comprehensive response synthesis. 
A web-based Streamlit interface (Snowflake Inc) [<xref ref-type="bibr" rid="ref23">23</xref>] presents conversations with integrated citations, whereas tool use metadata are stored with user consent to inform institutional reporting.</p></sec><sec id="s2-3-2"><title>Operational Workflow</title><p>The query processing follows an iterative orchestration loop (<xref ref-type="fig" rid="figure2">Figure 2</xref>). Upon receiving a user message (query), the controller examines the question along with the recent conversation history to assess the information requirements. The controller then enters a decision cycle: (1) analyze information gaps, (2) select the most appropriate tool, (3) retrieve results via a semantic similarity search, (4) review relevance, and (5) determine whether sufficient evidence exists or additional retrieval is required. This process continues until comprehensive information is gathered for response generation.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Chatbot operational workflow showing the iterative ReAct agent architecture. The user query flows through reasoning and tool selection, with the controller dynamically selecting from 5 specialized tools (papers, the web, social media, news, and chitchat). Results are observed and validated through a citation validation mechanism before generating the final response with proper source attribution.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="infodemiology_v6i1e90295_fig02.png"/></fig><p>The system generates responses using inline citation markers corresponding to the retrieved documents, enabling users to trace claims to their original sources. 
A 2-level citation validation mechanism ensures quality: individual tools validate their own citations, and a dedicated validation tool examines the entire response for citation completeness before delivery.</p><p>Privacy protection is implemented for social media queries, synthesizing themes and sentiment patterns without attributing statements to individual users. Stateful conversation management maintains dialogue context through a windowed history approach, enabling interpretation of follow-up questions with implicit references (eg, &#x201C;What about side effects?&#x201D; following a vaccine efficacy discussion) while maintaining topical continuity.</p></sec></sec><sec id="s2-4"><title>Report Generation System</title><p>We developed an automated report generation system that synthesizes data from multiple sources to produce comprehensive PDF reports for medical institutions and policymakers. This system uses LLMs for intelligent analysis and generates professional documents with academic-style citations, visualizations, and actionable insights.</p><sec id="s2-4-1"><title>System Architecture</title><p>The report generation system (<xref ref-type="fig" rid="figure3">Figure 3</xref>) comprises four specialized analysis modules coordinated by a central orchestrator: (1) news analyzer for recent news, (2) paper analyzer for recent academic research, (3) social media analyzer for public sentiment analysis, and (4) chat analyzer for user interaction pattern identification. Each module queries the vector database for documents within a configurable time window, performs domain-specific analysis using an LLM-based inference, and generates a structured output with properly formatted citations. 
The orchestrator coordinates module execution, manages data flow between components, aggregates results, and assembles the final PDF document with bilingual support (Japanese and English).</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Report generation system architecture. Data flow from external news sources and vector database collections (papers, social media, and user conversations) through specialized analysis modules. The social media analyzer performs topic modeling, stance detection, and misinformation detection. All analysis results are integrated through cross-source aggregation before final report generation.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="infodemiology_v6i1e90295_fig03.png"/></fig></sec><sec id="s2-4-2"><title>Social Media Analysis</title><p>Social media platforms have emerged as critical channels for public health discourse, serving as real-time indicators of population-level attitudes toward vaccination [<xref ref-type="bibr" rid="ref24">24</xref>]. For medical institutions and policymakers, a systematic analysis of social media content provides valuable insights into public concerns, emerging misinformation narratives, and temporal shifts in vaccine sentiment [<xref ref-type="bibr" rid="ref25">25</xref>].</p><p>The social media analyzer performs a multidimensional public opinion assessment through 4 complementary analytical processes. Stance classification categorizes each post as supportive, opposed, neutral, or unclear regarding HPV vaccination using batch LLM inference with a temporal context, thereby aggregating daily counts to track sentiment evolution. 
Topic modeling uses a hybrid approach that combines statistical latent Dirichlet allocation [<xref ref-type="bibr" rid="ref26">26</xref>] with LLM-based semantic interpretation in which Japanese text undergoes morphological analysis [<xref ref-type="bibr" rid="ref27">27</xref>], term frequency&#x2013;inverse document frequency weighting [<xref ref-type="bibr" rid="ref28">28</xref>], and model training via Gensim (RARE Technologies Ltd) [<xref ref-type="bibr" rid="ref29">29</xref>] to extract interpretable topic labels from keyword distributions. Misinformation detection uses LLM-based analysis to identify posts containing claims that contradict the established scientific consensus, categorizing detected content by type (safety concerns, efficacy doubts, and conspiracy theories) for institutional awareness. Visualization generation produces temporal trend graphics and thematic distribution charts that are embedded directly in the reports to enhance interpretability for nontechnical stakeholders.</p></sec><sec id="s2-4-3"><title>PDF Report Assembly</title><p>The report generator produces professional bilingual documents (in Japanese and English) structured into five main sections: (1) news trends, presenting influential media coverage with relevance assessments; (2) research progress, synthesizing recent academic literature; (3) social media analysis, containing sentiment trends, topic distributions, and visualizations; (4) chat analysis, identifying user information needs and knowledge gaps; and (5) overall summary, providing an executive synthesis across all data sources.</p><p>Each section includes inline citations with source-appropriate formatting, which enables medical institutions to verify information and assess evidence quality independently. The orchestrator synthesizes the findings from all the analysis modules into an executive summary that provides a comprehensive overview of the reporting period. 
This multisource integration approach captures diverse perspectives, enabling stakeholders to develop informed HPV vaccination communication strategies and policy interventions.</p></sec></sec><sec id="s2-5"><title>Evaluation Framework</title><p>We developed a multifaceted evaluation framework comprising complementary assessment protocols for chatbot performance and automated report generation. The framework uses an LLM-based evaluation for scalable assessment and human expert validation for quality assurance.</p><sec id="s2-5-1"><title>Chatbot Evaluation Methodology</title><p>The chatbot evaluation framework assesses system performance through 2 complementary protocols: single-turn evaluation for individual question-answer exchanges and multiturn evaluation for complete conversation quality. Both protocols use LLM-based judges who receive conversation context, tool use information, and scoring rubrics, generating scores on a scale from 0 to 5 (details are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendices 1</xref> and <xref ref-type="supplementary-material" rid="app2">2</xref>) for each dimension, along with written evaluation notes.</p><p>To collect test data, 3 volunteers simulated diverse user personas (<xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>) to create realistic conversations. They posed questions across different personas and topics, and multiturn conversations with the production chatbot system were conducted and stored for subsequent evaluations.</p><p>Using the test data, a single-turn evaluation assesses 5 dimensions: relevance, measuring whether the response addressed the question; routing, evaluating the appropriate tool selection for the query type; reference, assessing citation validity and proper source attribution; correctness, verifying factual accuracy against established guidelines; and identity, examining professional medical communication tone. 
Multiturn evaluation extends these 5 dimensions through 2 additional metrics for conversational coherence: context memory, which assesses the appropriate use of information from previous turns, and topic centering, which evaluates natural conversation flow with logical transitions between related topics.</p><p>We randomly selected 20 question-answer pairs for manual scoring by the 3 domain experts to validate the reliability of the automated evaluation. The correlation between the expert and LLM-generated scores was analyzed to assess whether the automated metrics accurately reflected human judgment.</p></sec><sec id="s2-5-2"><title>Report Generation Evaluation Methodology</title><p>Report quality assessment uses 2 complementary evaluation protocols: main text evaluation for content quality and reference evaluation for citation validity. Both protocols use LLM-based judges with standardized scoring rubrics, generating scores on a scale from 0 to 5 (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendices 4</xref> and <xref ref-type="supplementary-material" rid="app5">5</xref>) for each dimension. The main text evaluation assesses 3 dimensions: completeness, measuring structural integrity and whether sections contain well-developed content; correctness, evaluating factual accuracy and proper interpretation of source materials; and helpfulness, examining practical utility and actionable insights for institutional stakeholders. Reference evaluation validates citation quality across 2 dimensions: reference validity, measuring the proportion of cited sources that are accessible and exist in the underlying database, and citation correctness, assessing whether citations properly support the claims made in the report text.</p><p>Temporal analysis generates reports for multiple periods to assess system robustness across varying conditions. This approach evaluates both the system consistency and the ability to capture temporal variations in public discourse. 
Three volunteers read and scored each report independently.</p></sec></sec><sec id="s2-6"><title>Ethical Considerations</title><p>The knowledge base was constructed exclusively from publicly available data: PubMed abstracts, World Health Organization and Japanese MHLW documents, news articles, and public X posts. Social media data were analyzed only in aggregate; no individual users were identified or quoted.</p><p>During chatbot evaluation, 3 volunteer members of the research team generated simulated conversations with informed consent. No personally identifiable information or personal health data were collected. Conversation data and metadata (time stamps and tool use logs) were stored in a secured database accessible only to the research team.</p><p>This study was determined to not require institutional review board review in accordance with the Ethical Guidelines for Medical and Biological Research Involving Human Subjects (Ministry of Education, Culture, Sports, Science, and Technology; MHLW; and Ministry of Economy, Trade, and Industry, 2021) [<xref ref-type="bibr" rid="ref30">30</xref>] because (1) all external data were publicly available and analyzed without individual identification; (2) evaluation participants were consenting research team members, not patients or vulnerable populations; and (3) no clinical interventions or health outcomes were measured.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Database</title><p>This database integrates 139,803 documents across 4 specialized collections (<xref ref-type="table" rid="table1">Table 1</xref>). 
The volume and distribution of these documents enable a realistic simulation of operational use cases.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Database collection statistics.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Collection</td><td align="left" valign="bottom">Documents (n=139,803), n (%)</td><td align="left" valign="bottom">Unique source</td><td align="left" valign="bottom">Primary language</td></tr></thead><tbody><tr><td align="left" valign="top">Papers</td><td align="left" valign="top">51 (0.04)</td><td align="left" valign="top">37 journals</td><td align="left" valign="top">English (100%)</td></tr><tr><td align="left" valign="top">Official documents</td><td align="left" valign="top">6739 (4.82)</td><td align="left" valign="top">MHLW<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="top">Japanese (100%)</td></tr><tr><td align="left" valign="top">Social media</td><td align="left" valign="top">132,874 (95.04)</td><td align="left" valign="top">X (formerly known as Twitter)</td><td align="left" valign="top">Japanese (100%)</td></tr><tr><td align="left" valign="top">Chat conversations</td><td align="left" valign="top">139 (0.10)</td><td align="left" valign="top">31 sessions</td><td align="left" valign="top">Japanese (59.3%)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>MHLW: Ministry of Health, Labor, and Welfare.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Chatbot</title><p>On the basis of this large-scale database, we simulated operational use cases.</p><sec id="s3-2-1"><title>Example Response</title><p><xref ref-type="other" rid="box1">Textbox 1</xref> presents a representative question-answer exchange that demonstrates the response characteristics of the chatbot.</p><boxed-text id="box1"><title>Example chatbot response demonstrating professional medical tone,
structured information delivery, evidence-based recommendations with citations, and personalized guidance.</title><p><bold>User query</bold></p><p>&#x201C;Are there any specific HPV-related health risks or symptoms that become more common or concerning for people over 65?&#x201D;</p><p><bold>Chatbot response</bold></p><p>&#x201C;Thank you for asking such an important question about HPV-related health risks for people over 65. As a healthcare provider, I want to address your concerns with both the latest medical evidence and an understanding of the unique considerations for older adults.</p><p><italic>Key Health Risks and Symptoms After 65</italic></p><p>While HPV infections can occur at any age, there are some important patterns to understand for people over 65. Research shows that while new HPV infections become less common with age, when cervical abnormalities do occur in older adults, they tend to be more significant [1].</p><p>...</p><p><italic>References:</italic></p><p>[1] Prevalence of cervical cytology abnormalities and its correlation with human papillomavirus infection. Medicinski glasnik (2025). DOI: 10.17392/1968-22-02&#x201D;</p></boxed-text></sec><sec id="s3-2-2"><title>Evaluation Results</title><p>The chatbot system was evaluated using simulated conversations generated by 3 volunteers. Conversations spanned diverse user personas of varying age groups, occupations, and levels of concern about HPV vaccination. The questions covered multiple information domains, including vaccine safety, efficacy, eligibility criteria, and procedural guidelines. <xref ref-type="table" rid="table2">Table 2</xref> summarizes the single-turn evaluation results (<xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>) for all 5 assessment dimensions. Average scores ranged from 4.50 to 4.90 on the scale from 0 to 5, with correctness (4.90, 95% CI 4.78-4.99) and routing (4.89, 95% CI 4.79-4.97) achieving the highest scores.
Across all dimensions, 90% (125/139) to 99% (137/139) of responses received scores of 4 or higher.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Single-turn evaluation results (n=139 question-answer pairs).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dimension</td><td align="left" valign="bottom">Score (0-5), mean (95% CI)<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="bottom">Score (0-5), median (IQR)</td><td align="left" valign="bottom">Score of 5, %</td><td align="left" valign="bottom">Score of &#x2265;4, %</td><td align="left" valign="bottom">Score of &#x2264;3, %</td></tr></thead><tbody><tr><td align="left" valign="top">Relevance</td><td align="left" valign="top">4.83 (4.71-4.93)</td><td align="left" valign="top">5.00 (5.00-5.00)</td><td align="left" valign="top">91</td><td align="left" valign="top">96</td><td align="left" valign="top">4</td></tr><tr><td align="left" valign="top">Routing</td><td align="left" valign="top">4.89 (4.79-4.97)</td><td align="left" valign="top">5.00 (5.00-5.00)</td><td align="left" valign="top">94</td><td align="left" valign="top">97</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top">Reference</td><td align="left" valign="top">4.50 (4.27-4.70)</td><td align="left" valign="top">5.00 (5.00-5.00)</td><td align="left" valign="top">81</td><td align="left" valign="top">90</td><td align="left" valign="top">10</td></tr><tr><td align="left" valign="top">Correctness</td><td align="left" valign="top">4.90 (4.78-4.99)</td><td align="left" valign="top">5.00 (5.00-5.00)</td><td align="left" valign="top">96</td><td align="left" valign="top">99</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">Identity</td><td align="left" valign="top">4.88 (4.78-4.96)</td><td align="left" valign="top">5.00 (5.00-5.00)</td><td align="left" valign="top">93</td><td
align="left" valign="top">99</td><td align="left" valign="top">1</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Overall mean 4.80.</p></fn></table-wrap-foot></table-wrap><p>A comparison of multiturn with single-turn evaluations revealed consistent improvements (<xref ref-type="table" rid="table3">Table 3</xref>; details are provided in <xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref>); the overall average increased from 4.80 to 4.98 (+0.18). Topic centering and identity both achieved perfect scores of 5.00 in multiturn settings, indicating that the chatbot maintained natural conversation flow and a consistent professional tone across extended dialogues. Across all dimensions, all responses received scores of 4 or higher.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Multiturn evaluation results (n=31 conversations).</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dimension</td><td align="left" valign="bottom">Score (0-5), mean (95% CI)<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="bottom">Score (0-5), median (IQR)</td><td align="left" valign="bottom">Score of 5, %</td><td align="left" valign="bottom">Score of &#x2265;4, %</td><td align="left" valign="bottom">Score of &#x2264;3, %</td></tr></thead><tbody><tr><td align="left" valign="top">Context memory</td><td align="left" valign="top">4.94 (4.84-5.00)</td><td align="left" valign="top">5.00 (5.00-5.00)</td><td align="left" valign="top">94</td><td align="left" valign="top">100</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">Topic centering</td><td align="left" valign="top">5.00 (5.00-5.00)</td><td align="left" valign="top">5.00 (5.00-5.00)</td><td align="left" valign="top">100</td><td align="left" valign="top">100</td><td align="left" valign="top">0</td></tr><tr><td align="left"
valign="top">Identity</td><td align="left" valign="top">5.00 (5.00-5.00)</td><td align="left" valign="top">5.00 (5.00-5.00)</td><td align="left" valign="top">100</td><td align="left" valign="top">100</td><td align="left" valign="top">0</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Overall mean 4.98.</p></fn></table-wrap-foot></table-wrap><p>We compared the automated scores with human expert assessments to validate the reliability of the LLM-based evaluation. Three domain experts independently scored randomly selected subsets of conversations (n=20 for single-turn evaluation and n=11 for multiturn evaluation). We report 3 complementary agreement metrics: mean absolute difference (MAD) between averaged expert and LLM scores, Spearman rank correlation coefficient (&#x03C1;) for item-level ranking agreement, and intraclass correlation coefficient (ICC(3,1)) for interrater reliability among the 3 experts. <xref ref-type="table" rid="table4">Table 4</xref> lists the validation results.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Agreement with human experts and interrater reliability.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Check type and dimension</td><td align="left" valign="bottom">Rater 1</td><td align="left" valign="bottom">Rater 2</td><td align="left" valign="bottom">Rater 3</td><td align="left" valign="bottom">MAD<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="bottom">Spearman &#x03C1; (95% CI)</td><td align="left" valign="bottom">ICC<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup>(3,1) (95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="7">Single-turn evaluation (n=20 conversations)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Relevance</td><td align="left" 
valign="top">0</td><td align="left" valign="top">0.10</td><td align="left" valign="top">0.05</td><td align="left" valign="top">0.05</td><td align="left" valign="top">0.609 (0.227 to 0.828)<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="left" valign="top">0.203 (&#x2212;0.057 to 0.511)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Routing</td><td align="left" valign="top">0.40</td><td align="left" valign="top">1.25</td><td align="left" valign="top">0</td><td align="left" valign="top">0.55</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td><td align="left" valign="top">0.279 (0.008 to 0.575)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Reference</td><td align="left" valign="top">0.60</td><td align="left" valign="top">0.75</td><td align="left" valign="top">0.05</td><td align="left" valign="top">0.47</td><td align="left" valign="top">0.068 (&#x2212;0.386 to 0.496)</td><td align="left" valign="top">0.170 (&#x2212;0.085 to 0.481)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Correctness</td><td align="left" valign="top">0.05</td><td align="left" valign="top">0.20</td><td align="left" valign="top">0</td><td align="left" valign="top">0.08</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2212;0.025 (&#x2212;0.229 to 0.281)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Identity</td><td align="left" valign="top">0.15</td><td align="left" valign="top">0.05</td><td align="left" valign="top">0.55</td><td align="left" valign="top">0.25</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.314 (0.040 to 
0.603)</td></tr><tr><td align="left" valign="top" colspan="7">Multiturn evaluation (n=11 conversations)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Context memory</td><td align="left" valign="top">0.13</td><td align="left" valign="top">0.13</td><td align="left" valign="top">0.13</td><td align="left" valign="top">0.13</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1.000 (1.000 to 1.000)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Topic centering</td><td align="left" valign="top">0</td><td align="left" valign="top">0.13</td><td align="left" valign="top">0</td><td align="left" valign="top">0.04</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.000 (&#x2212;0.307 to 0.545)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Identity</td><td align="left" valign="top">0</td><td align="left" valign="top">0.13</td><td align="left" valign="top">0.55</td><td align="left" valign="top">0.21</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.174 (&#x2212;0.208 to 0.684)</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>MAD: mean absolute difference. 
Overall MAD: 0.28 for single-turn evaluation and 0.13 for multiturn evaluation.</p></fn><fn id="table4fn2"><p><sup>b</sup>ICC: intraclass correlation coefficient.</p></fn><fn id="table4fn3"><p><sup>c</sup><italic>P</italic>&#x003C;.01.</p></fn><fn id="table4fn4"><p><sup>d</sup>The large language model scores were all the same, and correlation could not be calculated.</p></fn></table-wrap-foot></table-wrap><p>For single-turn evaluation, relevance (MAD=0.05) and correctness (MAD=0.08) exhibited close LLM-human alignment, whereas routing (MAD=0.55) and reference (MAD=0.47) showed larger deviations. The overall MAD of 0.28 on a scale from 0 to 5 represents a deviation of less than 6% from human judgment. The Spearman &#x03C1; was computable only for relevance (&#x03C1;=0.609, 95% CI 0.227-0.828; <italic>P</italic>=.004) and reference (&#x03C1;=0.068; <italic>P</italic>=.78) as the LLM assigned uniform perfect scores for the remaining dimensions, leaving no variance for correlation analysis. This ceiling tendency of the LLM judge is itself a notable finding. Interrater reliability among the 3 experts was low to moderate (ICC range &#x2212;0.025 to 0.314), indicating limited consensus even among human raters, particularly for correctness (ICC=&#x2212;0.025) and reference (ICC=0.170). Per-rater analysis revealed that individual LLM-expert disagreement varied substantially (eg, routing MAD ranged from 0.00 to 1.25 across raters), suggesting that the observed LLM-human discrepancies partly reflect genuine interexpert disagreement rather than systematic LLM bias.</p><p>The multiturn evaluation exhibited closer overall alignment (MAD=0.13; 2.6% deviation), with topic centering achieving near-perfect agreement (MAD=0.04).
Context memory showed perfect interrater reliability (ICC=1.000) as all 3 experts assigned identical scores, whereas topic centering (ICC=0.000) and identity (ICC=0.174) showed that the small deviations among raters did not follow consistent patterns&#x2014;a consequence of near-ceiling score distributions. These results suggest that LLM-based evaluation serves as a reasonable proxy for human judgment on dimensions with clear-cut criteria (relevance and correctness) but should be interpreted cautiously for dimensions involving subjective judgment (routing and reference quality), where both the LLM and human raters exhibited greater variability.</p></sec></sec><sec id="s3-3"><title>Report Generation</title><p>The report generation system was evaluated for 4 distinct periods: January 2020, July 2020, September 2020, and October 2020. We provide an example report in <xref ref-type="supplementary-material" rid="app8">Multimedia Appendix 8</xref>. For each period, the system generated complete reports by analyzing 30 days of data from all source collections. Three evaluators independently scored each report section, and we reported the mean with 95% bootstrap CIs across evaluators. Both main text assessment (completeness, correctness, and helpfulness) and reference validation (reference validity and citation correctness) protocols were applied to each report section. The detailed scoring results are presented in <xref ref-type="supplementary-material" rid="app9">Multimedia Appendix 9</xref>.</p><p><xref ref-type="table" rid="table5">Table 5</xref> summarizes the main text and reference evaluation results. Completeness and correctness both exhibited strong ceiling effects, with medians of 5.00 (IQR 5.00-5.00) and 100% (48/48) of responses achieving scores of 4 or above. 
Helpfulness showed lower performance (median 4.00, IQR 4.00-4.00; mean 4.12, 95% CI 3.98-4.27), with only 19% (9/48) of responses achieving the maximum score, suggesting room for improvement in generating actionable insights. Reference validity achieved perfect scores across all evaluations (mean 5.00, 95% CI 5.00-5.00). Citation correctness was the most variable dimension (median 4.00, IQR 4.00-5.00; mean 4.21, 95% CI 3.96-4.46), with scores spanning the range of 4 to 5 and only 29% (7/24) of responses achieving perfect scores, indicating that the accuracy of citation-to-claim matching is the primary area for improvement in the report generation pipeline.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Report main text and reference evaluation results.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dimension</td><td align="left" valign="bottom">Score (0-5), mean (95% CI)</td><td align="left" valign="bottom">Score (0-5), median (IQR)</td><td align="left" valign="bottom">Score of 5, %</td><td align="left" valign="bottom">Score of &#x2265;4, %</td><td align="left" valign="bottom">Score of &#x2264;3, %</td></tr></thead><tbody><tr><td align="left" valign="top">Completeness</td><td align="left" valign="top">4.83 (4.73-4.94)</td><td align="left" valign="top">5.00 (5.00-5.00)</td><td align="left" valign="top">83</td><td align="left" valign="top">100</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">Correctness</td><td align="left" valign="top">4.88 (4.77-4.96)</td><td align="left" valign="top">5.00 (5.00-5.00)</td><td align="left" valign="top">88</td><td align="left" valign="top">100</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">Helpfulness</td><td align="left" valign="top">4.12 (3.98-4.27)</td><td align="left" valign="top">4.00 (4.00-4.00)</td><td align="left" valign="top">19</td><td align="left" valign="top">94</td><td
align="left" valign="top">0</td></tr><tr><td align="left" valign="top">Reference validity</td><td align="left" valign="top">5.00 (5.00-5.00)</td><td align="left" valign="top">5.00 (5.00-5.00)</td><td align="left" valign="top">100</td><td align="left" valign="top">100</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">Citation correctness</td><td align="left" valign="top">4.21 (3.96-4.46)</td><td align="left" valign="top">4.00 (4.00-5.00)</td><td align="left" valign="top">29</td><td align="left" valign="top">92</td><td align="left" valign="top">0</td></tr></tbody></table></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study demonstrated the feasibility of a dual-purpose AI agent system for HPV vaccine information communication in Japan. This system integrates heterogeneous data sources (academic literature, government documents, news media, and social media) into a unified retrieval infrastructure that supports public-facing conversational interfaces and institutional analytical reporting.</p><p>In this preliminary evaluation, the chatbot achieved single-turn medians of 5.00 across all 5 dimensions (IQR 5.00-5.00), with means ranging from 4.50 (reference) to 4.90 (correctness). The pronounced ceiling effects indicate generally high performance but limit fine-grained differentiation between dimensions. Top-box analysis revealed that reference quality received lower scores (14/139, 10% at or below a score of 3 compared to a range of 2/139, 1% to 6/139, 4% for other dimensions), identifying citation handling as the primary area for improvement. Multiturn evaluation showed even stronger ceiling effects (overall mean 4.98), with topic centering and identity both achieving perfect scores across all 31 conversations. 
The overall mean improvement from single-turn to multiturn evaluation (+0.18) should be interpreted cautiously as it primarily reflects the elimination of low-scoring outliers rather than a broad performance shift. These preliminary results suggest that the iterative multitool orchestration architecture may maintain factual accuracy while delivering appropriately toned medical communication, although validation with real users is needed.</p><p>The report generation system maintained consistent quality across 4 temporal evaluation periods. Completeness and correctness exhibited strong ceiling effects (median 5.00, IQR 5.00-5.00), confirming reliable document structure and factual accuracy regardless of data availability fluctuations. Helpfulness was the most variable main text dimension (median 4.00, IQR 4.00-4.00; 9/48, 19% of responses achieving perfect scores), suggesting that, while the system reliably produces structurally complete and accurate reports, generating actionable institutional insights&#x2014;particularly for chat analysis sections&#x2014;remains a challenge. Citation correctness showed the widest spread among all report dimensions (median 4.00, IQR 4.00-5.00; 7/24, 29% of responses achieving perfect scores), indicating that the system consistently identifies valid sources (reference validity of median 5.00, IQR 5.00-5.00 across all periods) but is less consistent in matching citations to the specific claims they support. This distinction highlights an important design consideration for automated reporting systems: source retrieval and citation-claim alignment require different optimization strategies.</p></sec><sec id="s4-2"><title>Interpretation</title><p>These findings suggest that LLM-based RAG systems may address the information asymmetry between medical institutions and vaccine-hesitant populations. 
The chatbot architecture differs from traditional static frequently asked question systems in that it dynamically selects and combines information from specialized knowledge sources, enabling responses that integrate academic evidence with official guidelines and contemporary public discourse.</p><p>The observed improvements in the multiturn evaluation merit consideration. The high context memory scores suggest that the controller incorporates information from previous turns, such as retaining user demographic information when providing age-specific recommendations. Topic centering scores suggest smooth transitions between related topics, resembling the natural progression of clinical consultations from symptoms to screening and prevention. These patterns indicate that the windowed conversation history approach provides sufficient context for a coherent extended dialogue. The low interrater reliability (<xref ref-type="table" rid="table4">Table 4</xref>) among human experts in single-turn evaluation (ICC range &#x2212;0.025 to 0.314) may partly reflect the inherent ambiguity of judging individual question-answer pairs in isolation, where raters lack conversational context to disambiguate routing and reference quality assessments. The higher agreement in multiturn evaluation, where full dialogue context is available (context memory ICC=1.000), supports this interpretation.</p><p>A 2-level citation validation mechanism may contribute to maintaining response quality. Reference scores confirm consistent source attribution, addressing a key concern in health information systems where users must verify claims independently. Although prospective studies are required to confirm this relationship, transparency may also contribute to user trust.</p><p>For institutional stakeholders, the report generation system offers capabilities that would otherwise require substantial manual effort. 
The consistent structural completeness across the evaluation periods and sections demonstrated reliable document generation regardless of data availability fluctuations. Per-section analysis (<xref ref-type="supplementary-material" rid="app10">Multimedia Appendix 10</xref>) reveals that paper sections consistently achieved the highest scores, whereas chat sections exhibited more variability in helpfulness. The September 2020 social media section maintained acceptable quality despite sparser data, suggesting robustness to temporal variation.</p><p>Japanese-language implementation addresses challenges specific to this context: multiscript processing, medical terminology localization, and culturally appropriate formal communication. The successful integration of Japanese government documents with English-language research literature provides initial evidence of the viability of this approach in settings where scientific evidence and public health communication occur in different languages.</p></sec><sec id="s4-3"><title>Comparison With Prior Work</title><p>This work extends previous HPV vaccine natural language processing research, which has primarily focused on passive social media analysis [<xref ref-type="bibr" rid="ref10">10</xref>], by implementing a bidirectional information flow. Prior health chatbots have typically retrieved information from single knowledge sources [<xref ref-type="bibr" rid="ref11">11</xref>], whereas our iterative multitool architecture integrates 4 heterogeneous collections, enabling responses that synthesize information across source types.</p><p>The evaluation framework extends beyond typical single-turn RAG assessments [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>] by incorporating a multiturn conversation analysis with simulated users.
This approach captures the dimensions of conversational coherence, context memory, and topic continuity that single-exchange evaluations overlook.</p></sec><sec id="s4-4"><title>Limitations</title><p>This study has several limitations that warrant consideration. Most fundamentally, this work represents a proof-of-concept system evaluation rather than a clinical or public health impact study. The system was not tested with real patients, members of the public, or clinical end users in an operational setting. The evaluation relied on simulated conversations generated by 3 volunteers and report assessments by 3 evaluators over 4 periods, which may be insufficient to support strong generalizable conclusions. Additionally, the social media data were exclusively derived from Japanese X users, potentially underrepresenting the older adult population and those with limited digital access. Furthermore, the LLM-based evaluation may introduce biases that differ from human judgment, particularly for nuanced routing decisions in which multiple valid tool selections exist. The evaluation dataset (31 conversations and 139 exchanges) may not capture all real-world interaction patterns, and keyword-based data collection introduces potential selection bias. The system&#x2019;s specificity to the Japanese governmental and regulatory context constrains transferability to other national contexts with different regulatory frameworks and vaccination policies.</p></sec><sec id="s4-5"><title>Future Directions</title><p>Several directions warrant further investigation. Future prospective studies with real users and measurable health outcomes are needed to assess the system&#x2019;s impact on vaccine knowledge, trust, and decision-making, whereas A/B testing will assess whether citation transparency affects user trust.
Expansion to other languages and comparative studies across vaccine types would test the generalizability and inform the design principles for vaccine information communication systems facing similar challenges in other contexts.</p></sec><sec id="s4-6"><title>Conclusions</title><p>This study demonstrated the initial feasibility of an AI agent system that simultaneously addresses public HPV vaccine information needs and institutional discourse monitoring in Japan. The integrated architecture may enable bidirectional information flow&#x2014;providing verified information with transparent source attribution to users while generating analytical reports for institutional stakeholders&#x2014;creating feedback loops between public concerns and communication strategies. Although this evaluation relied on simulated users, this study established a proof of concept for an AI-augmented vaccine information communication infrastructure, with the transferable architecture and evaluation frameworks providing foundations for adaptation to other vaccines, other health conditions, and multilingual public health contexts.</p></sec></sec></body><back><ack><p>The authors used the generative artificial intelligence tool Claude [<xref ref-type="bibr" rid="ref33">33</xref>] by Anthropic to refine the writing and structure of the manuscript, which was further reviewed and revised by the study group.
The original Claude transcripts are available in <xref ref-type="supplementary-material" rid="app11">Multimedia Appendix 11</xref>.</p></ack><notes><sec><title>Funding</title><p>No external financial support or grants were received from any public, commercial, or not-for-profit entities for the research, authorship, or publication of this article.</p></sec><sec><title>Data Availability</title><p>Snapshots of data collection for chats, papers, and official documents or web sources used for retrieval-augmented generation, as well as social media IDs, are available in the Hugging Face repository [<xref ref-type="bibr" rid="ref34">34</xref>].</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">HPV</term><def><p>human papillomavirus</p></def></def-item><def-item><term id="abb3">ICC</term><def><p>intraclass correlation coefficient</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">MAD</term><def><p>mean absolute difference</p></def></def-item><def-item><term id="abb6">MeSH</term><def><p>Medical Subject Headings</p></def></def-item><def-item><term id="abb7">MHLW</term><def><p>Ministry of Health, Labor, and Welfare</p></def></def-item><def-item><term id="abb8">RAG</term><def><p>retrieval-augmented generation</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Global burden of 
cervical cancer: current estimates, temporal trend and future projections based on the GLOBOCAN 2022</article-title><source>J Natl Cancer Cent</source><year>2025</year><volume>5</volume><issue>3</issue><fpage>322</fpage><lpage>329</lpage><pub-id pub-id-type="doi">10.1016/j.jncc.2024.11.006</pub-id><pub-id pub-id-type="medline">40693230</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Drolet</surname><given-names>M</given-names> </name><name name-style="western"><surname>B&#x00E9;nard</surname><given-names>&#x00C9;</given-names> </name><name name-style="western"><surname>Boily</surname><given-names>MC</given-names> </name><etal/></person-group><article-title>Population-level impact and herd effects following human papillomavirus vaccination programmes: a systematic review and meta-analysis</article-title><source>Lancet Infect Dis</source><year>2015</year><month>05</month><volume>15</volume><issue>5</issue><fpage>565</fpage><lpage>580</lpage><pub-id pub-id-type="doi">10.1016/S1473-3099(14)71073-4</pub-id><pub-id pub-id-type="medline">25744474</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>MacDonald</surname><given-names>NE</given-names> </name><collab>SAGE Working Group on Vaccine Hesitancy</collab></person-group><article-title>Vaccine hesitancy: definition, scope and determinants</article-title><source>Vaccine (Auckl)</source><year>2015</year><month>08</month><day>14</day><volume>33</volume><issue>34</issue><fpage>4161</fpage><lpage>4164</lpage><pub-id pub-id-type="doi">10.1016/j.vaccine.2015.04.036</pub-id><pub-id pub-id-type="medline">25896383</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Simms</surname><given-names>KT</given-names> </name><name name-style="western"><surname>Hanley</surname><given-names>SJB</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Keane</surname><given-names>A</given-names> </name><name name-style="western"><surname>Canfell</surname><given-names>K</given-names> </name></person-group><article-title>Impact of HPV vaccine hesitancy on cervical cancer in Japan: a modelling study</article-title><source>Lancet Public Health</source><year>2020</year><month>04</month><volume>5</volume><issue>4</issue><fpage>e223</fpage><lpage>e234</lpage><pub-id pub-id-type="doi">10.1016/S2468-2667(20)30010-4</pub-id><pub-id pub-id-type="medline">32057317</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>NF</given-names> </name><name name-style="western"><surname>Vel&#x00E1;squez</surname><given-names>N</given-names> </name><name name-style="western"><surname>Restrepo</surname><given-names>NJ</given-names> </name><etal/></person-group><article-title>The online competition between pro- and anti-vaccination views</article-title><source>Nature</source><year>2020</year><month>06</month><volume>582</volume><issue>7811</issue><fpage>230</fpage><lpage>233</lpage><pub-id pub-id-type="doi">10.1038/s41586-020-2281-1</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sylvia Chou</surname><given-names>WY</given-names> </name><name name-style="western"><surname>Gaysynsky</surname><given-names>A</given-names> </name><name name-style="western"><surname>Cappella</surname><given-names>JN</given-names> </name></person-group><article-title>Where we go from here: health 
misinformation on social media</article-title><source>Am J Public Health</source><year>2020</year><month>10</month><volume>110</volume><issue>S3</issue><fpage>S273</fpage><lpage>S275</lpage><pub-id pub-id-type="doi">10.2105/AJPH.2020.305905</pub-id><pub-id pub-id-type="medline">33001722</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Lewis</surname><given-names>P</given-names> </name><name name-style="western"><surname>Perez</surname><given-names>E</given-names> </name><name name-style="western"><surname>Piktus</surname><given-names>A</given-names> </name><name name-style="western"><surname>Petroni</surname><given-names>F</given-names> </name><name name-style="western"><surname>Karpukhin</surname><given-names>V</given-names> </name><name name-style="western"><surname>Goyal</surname><given-names>N</given-names> </name></person-group><article-title>Retrieval-augmented generation for knowledge-intensive NLP tasks</article-title><source>Advances in Neural Information Processing Systems</source><year>2020</year><publisher-name>Curran Associates</publisher-name><fpage>9459</fpage><lpage>9474</lpage></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DS</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DS</given-names> </name></person-group><article-title>Large language models in medicine</article-title><source>Nat 
Med</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>1930</fpage><lpage>1940</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id><pub-id pub-id-type="medline">37460753</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kudo</surname><given-names>T</given-names> </name><name name-style="western"><surname>Yamamoto</surname><given-names>K</given-names> </name><name name-style="western"><surname>Matsumoto</surname><given-names>Y</given-names> </name></person-group><article-title>Applying conditional random fields to Japanese morphological analysis</article-title><conf-name>Proceedings of the 2004 Conference on Empirical Methods in Natural Language Processing</conf-name><fpage>230</fpage><lpage>237</lpage></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>E</given-names> </name><name name-style="western"><surname>Lerman</surname><given-names>K</given-names> </name><name name-style="western"><surname>Ferrara</surname><given-names>E</given-names> </name></person-group><article-title>Tracking social media discourse about the COVID-19 pandemic: development of a public coronavirus Twitter data set</article-title><source>JMIR Public Health Surveill</source><year>2020</year><month>05</month><day>29</day><volume>6</volume><issue>2</issue><fpage>e19273</fpage><pub-id pub-id-type="doi">10.2196/19273</pub-id><pub-id pub-id-type="medline">32427106</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Laranjo</surname><given-names>L</given-names> </name><name name-style="western"><surname>Dunn</surname><given-names>AG</given-names> </name><name 
name-style="western"><surname>Tong</surname><given-names>HL</given-names> </name><etal/></person-group><article-title>Conversational agents in healthcare: a systematic review</article-title><source>J Am Med Inform Assoc</source><year>2018</year><month>09</month><day>1</day><volume>25</volume><issue>9</issue><fpage>1248</fpage><lpage>1258</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocy072</pub-id><pub-id pub-id-type="medline">30010941</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ujiie</surname><given-names>M</given-names> </name></person-group><article-title>Resumption of active recommendation of the human papillomavirus vaccine in Japan and future challenges for the National Immunization Program</article-title><source>Hum Vaccin Immunother</source><year>2022</year><month>11</month><day>30</day><volume>18</volume><issue>6</issue><fpage>2090777</fpage><pub-id pub-id-type="doi">10.1080/21645515.2022.2090777</pub-id><pub-id pub-id-type="medline">35767827</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Yao</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>D</given-names> </name><etal/></person-group><article-title>ReAct: synergizing reasoning and acting in language models</article-title><access-date>2026-05-03</access-date><conf-name>11th International Conference on Learning Representations, ICLR 2023</conf-name><conf-date>May 1-5, 2023</conf-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://collaborate.princeton.edu/en/publications/react-synergizing-reasoning-and-acting-in-language-models/">https://collaborate.princeton.edu/en/publications/react-synergizing-reasoning-and-acting-in-language-models/</ext-link></comment></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="web"><source>Qdrant</source><access-date>2025-11-29</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://qdrant.tech/">https://qdrant.tech/</ext-link></comment></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="web"><article-title>PLaMo-embedding-1B</article-title><source>Hugging Face</source><access-date>2025-11-29</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/pfnet/plamo-embedding-1b">https://huggingface.co/pfnet/plamo-embedding-1b</ext-link></comment></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Reimers</surname><given-names>N</given-names> </name><name name-style="western"><surname>Gurevych</surname><given-names>I</given-names> </name></person-group><article-title>Sentence-BERT: sentence embeddings using Siamese BERT-networks</article-title><conf-name>Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)</conf-name><conf-date>Nov 3-7, 2019</conf-date><pub-id pub-id-type="doi">10.18653/v1/D19-1410</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>J</given-names> </name><name name-style="western"><surname>Douze</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jegou</surname><given-names>H</given-names> 
</name></person-group><article-title>Billion-scale similarity search with GPUs</article-title><source>IEEE Trans Big Data</source><year>2021</year><volume>7</volume><issue>3</issue><fpage>535</fpage><lpage>547</lpage><pub-id pub-id-type="doi">10.1109/TBDATA.2019.2921572</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="web"><source>PubMed</source><access-date>2025-11-29</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://pubmed.ncbi.nlm.nih.gov/">https://pubmed.ncbi.nlm.nih.gov/</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="web"><source>World Health Organization</source><access-date>2025-11-29</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.who.int/">https://www.who.int/</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="web"><source>Ministry of Health, Labour and Welfare, Japan</source><access-date>2025-11-29</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.mhlw.go.jp/">https://www.mhlw.go.jp/</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="web"><source>Tweepy</source><access-date>2025-11-29</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.tweepy.org/">https://www.tweepy.org/</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="web"><source>LlamaIndex</source><access-date>2024-10-18</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.llamaindex.ai/">https://www.llamaindex.ai/</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="web"><source>Streamlit</source><access-date>2025-11-29</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://streamlit.io/">https://streamlit.io/</ext-link></comment></nlm-citation></ref><ref 
id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singh</surname><given-names>T</given-names> </name><name name-style="western"><surname>Roberts</surname><given-names>K</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Social media as a research tool (SMaaRT) for risky behavior analytics: methodological review</article-title><source>JMIR Public Health Surveill</source><year>2020</year><month>11</month><day>30</day><volume>6</volume><issue>4</issue><fpage>e21660</fpage><pub-id pub-id-type="doi">10.2196/21660</pub-id><pub-id pub-id-type="medline">33252345</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Broniatowski</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Jamison</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Qi</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Weaponized health communication: Twitter bots and Russian trolls amplify the vaccine debate</article-title><source>Am J Public Health</source><year>2018</year><month>10</month><volume>108</volume><issue>10</issue><fpage>1378</fpage><lpage>1384</lpage><pub-id pub-id-type="doi">10.2105/AJPH.2018.304567</pub-id><pub-id pub-id-type="medline">30138075</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Blei</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Ng</surname><given-names>AY</given-names> </name><name name-style="western"><surname>Jordan</surname><given-names>MI</given-names> </name></person-group><article-title>Latent Dirichlet 
allocation</article-title><source>J Mach Learn Res</source><year>2003</year><volume>3</volume><fpage>993</fpage><lpage>1022</lpage><pub-id pub-id-type="doi">10.5555/944919.944937</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Kudo</surname><given-names>T</given-names> </name><name name-style="western"><surname>Yamamoto</surname><given-names>K</given-names> </name><name name-style="western"><surname>Matsumoto</surname><given-names>Y</given-names> </name></person-group><article-title>Applying conditional random fields to Japanese morphological analysis</article-title><source>Proceedings of the 2004 Conference on Empirical Methods in Natural Language Processing</source><year>2004</year><publisher-name>Association for Computational Linguistics</publisher-name><fpage>230</fpage><lpage>237</lpage></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Salton</surname><given-names>G</given-names> </name><name name-style="western"><surname>Buckley</surname><given-names>C</given-names> </name></person-group><article-title>Term-weighting approaches in automatic text retrieval</article-title><source>Inf Process Manag</source><year>1988</year><volume>24</volume><issue>5</issue><fpage>513</fpage><lpage>523</lpage><pub-id pub-id-type="doi">10.1016/0306-4573(88)90021-0</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>&#x0158;eh&#x016F;&#x0159;ek</surname><given-names>R</given-names> </name><name name-style="western"><surname>Sojka</surname><given-names>P</given-names> </name></person-group><article-title>Software framework for topic modelling with large corpora</article-title><conf-name>Proceedings of the LREC 2010 
Workshop on New Challenges for NLP Frameworks</conf-name><conf-date>May 22, 2010</conf-date><pub-id pub-id-type="doi">10.13140/2.1.2393.1847</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="web"><article-title>Ethical guidelines for medical and biological research involving human subjects</article-title><source>Ministry of Education, Culture, Sports, Science and Technology, Ministry of Health, Labour and Welfare, Ministry of Economy, Trade and Industry, Japan</source><year>2021</year><access-date>2026-05-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.mext.go.jp/content/20250325-mxt_life-000035486-01.pdf">https://www.mext.go.jp/content/20250325-mxt_life-000035486-01.pdf</ext-link></comment></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Es</surname><given-names>S</given-names> </name><name name-style="western"><surname>James</surname><given-names>J</given-names> </name><name name-style="western"><surname>Espinosa-Anke</surname><given-names>L</given-names> </name><name name-style="western"><surname>Schockaert</surname><given-names>S</given-names> </name></person-group><article-title>Ragas: automated evaluation of retrieval augmented generation</article-title><source>arXiv</source><comment>Preprint posted online on Sep 26, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2309.15217</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Gan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Tong</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Liu</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Z</given-names> </name></person-group><article-title>Evaluation of retrieval-augmented generation: a survey</article-title><source>arXiv</source><comment>Preprint posted online on May 13, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2405.07437</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="web"><source>Claude</source><access-date>2024-10-18</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.anthropic.com/claude">https://www.anthropic.com/claude</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="web"><article-title>Humanalysis-square/HPV_agent_data</article-title><source>Hugging Face</source><access-date>2025-12-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/datasets/humanalysis-square/HPV_agent_data">https://huggingface.co/datasets/humanalysis-square/HPV_agent_data</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Single-turn chat scoring metrics.</p><media xlink:href="infodemiology_v6i1e90295_app1.txt" xlink:title="TXT File, 7 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Multiple-turn chat scoring metrics.</p><media xlink:href="infodemiology_v6i1e90295_app2.txt" xlink:title="TXT File, 4 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Guidance for the volunteers to chat with the chatbot with different personas.</p><media xlink:href="infodemiology_v6i1e90295_app3.docx" xlink:title="DOCX File, 15 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Main text scoring metrics for the generated report.</p><media 
xlink:href="infodemiology_v6i1e90295_app4.txt" xlink:title="TXT File, 2 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Reference scoring metrics for the generated report.</p><media xlink:href="infodemiology_v6i1e90295_app5.txt" xlink:title="TXT File, 1 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Chat single-turn evaluation results.</p><media xlink:href="infodemiology_v6i1e90295_app6.txt" xlink:title="TXT File, 521 KB"/></supplementary-material><supplementary-material id="app7"><label>Multimedia Appendix 7</label><p>Chat multiple-turn evaluation results.</p><media xlink:href="infodemiology_v6i1e90295_app7.txt" xlink:title="TXT File, 397 KB"/></supplementary-material><supplementary-material id="app8"><label>Multimedia Appendix 8</label><p>Example of a generated report.</p><media xlink:href="infodemiology_v6i1e90295_app8.pdf" xlink:title="PDF File, 537 KB"/></supplementary-material><supplementary-material id="app9"><label>Multimedia Appendix 9</label><p>Scores for the generated reports.</p><media xlink:href="infodemiology_v6i1e90295_app9.xlsx" xlink:title="XLSX File, 17 KB"/></supplementary-material><supplementary-material id="app10"><label>Multimedia Appendix 10</label><p>Section-level evaluation results for the generated reports.</p><media xlink:href="infodemiology_v6i1e90295_app10.txt" xlink:title="TXT File, 1 KB"/></supplementary-material><supplementary-material id="app11"><label>Multimedia Appendix 11</label><p>Record of chat with artificial intelligence tool.</p><media xlink:href="infodemiology_v6i1e90295_app11.docx" xlink:title="DOCX File, 2310 KB"/></supplementary-material></app-group></back></article>