<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Infodemiology</journal-id><journal-id journal-id-type="publisher-id">infodemiology</journal-id><journal-id journal-id-type="index">38</journal-id><journal-title>JMIR Infodemiology</journal-title><abbrev-journal-title>JMIR Infodemiology</abbrev-journal-title><issn pub-type="epub">2564-1891</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v5i1e75493</article-id><article-id pub-id-type="doi">10.2196/75493</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Data Mining Trauma: AI-Assisted Qualitative Study of Cyber Victimization on Reddit</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Antisdel</surname><given-names>J'Andra</given-names></name><degrees>PhD, RN</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Miller</surname><given-names>Wendy R</given-names></name><degrees>PhD, RN</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Groves</surname><given-names>Doyle</given-names></name><degrees>BS</degrees><xref ref-type="aff" 
rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>Center for Enhancing Quality of Life in Chronic Illness, School of Nursing, Indiana University Indianapolis</institution><addr-line>600 Barnhill Drive</addr-line><addr-line>Indianapolis</addr-line><addr-line>IN</addr-line><country>United States</country></aff><aff id="aff2"><institution>Center for Integrated Healthcare Education, Saint Mary's College</institution><addr-line>Notre Dame</addr-line><addr-line>IN</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Haupt</surname><given-names>Michael</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Gupta</surname><given-names>Ankit</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Potla</surname><given-names>Ravi Teja</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Mohanadas</surname><given-names>Sadhasivam</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Guo</surname><given-names>Song-Bin</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to J'Andra Antisdel, PhD, RN, Center for Enhancing Quality of Life in Chronic Illness, School of Nursing, Indiana University Indianapolis, 600 Barnhill Drive, Indianapolis, IN, 46202, United States, 1 574-703-4472; <email>jalantis@iu.edu</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>all authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>3</day><month>9</month><year>2025</year></pub-date><volume>5</volume><elocation-id>e75493</elocation-id><history><date 
date-type="received"><day>04</day><month>04</month><year>2025</year></date><date date-type="rev-recd"><day>07</day><month>07</month><year>2025</year></date><date date-type="accepted"><day>20</day><month>07</month><year>2025</year></date></history><copyright-statement>&#x00A9; J'Andra Antisdel, Wendy R Miller, Doyle Groves. Originally published in JMIR Infodemiology (<ext-link ext-link-type="uri" xlink:href="https://infodemiology.jmir.org">https://infodemiology.jmir.org</ext-link>), 3.9.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Infodemiology, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://infodemiology.jmir.org/">https://infodemiology.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://infodemiology.jmir.org/2025/1/e75493"/><abstract><sec><title>Background</title><p>Cyber victimization exposes individuals to numerous risks. Developmental and psychological factors may leave some users unaware of the potential dangers, increasing their susceptibility to psychological distress. Despite this vulnerability, methods for identifying those at risk of cyber victimization within health care settings are limited, as is research that explores their experiences of cyber victimization. 
The purpose of this study was to analyze how users describe experiences of cyber victimization on the social media platform Reddit (Reddit, Inc) using data mining.</p></sec><sec><title>Objective</title><p>This study aimed to analyze and describe how users on Reddit describe and discuss their experience of cyber victimization using data mining and computational analysis of unsolicited data.</p></sec><sec sec-type="methods"><title>Methods</title><p>This computational qualitative study used data mining, Word Adjacency Graph (WAG) modeling, and thematic analysis to analyze discussions of Reddit users surrounding cyber victimization. Inclusion criteria included posts from 2012 to 2023 from subreddits r/cyberbullying and r/bullying. GPT-4 (OpenAI), an advanced artificial intelligence language model, summarized posts and assisted in cluster labeling. Posts were reviewed to remove irrelevant content and duplicates. User anonymity was maintained throughout the study.</p></sec><sec sec-type="results"><title>Results</title><p>A total of 13,381 posts from 3283 Reddit users were analyzed, with approximately 5.1% (n=678) originating between 2012 and 2018 and 94.9% (n=12,703) from 2019 to 2023. The WAG modeling approach identified 38 clusters, with 35 deemed to be relevant to cyber victimization experiences. Two clusters containing irrelevant material were excluded. Six overarching themes emerged: (1) psychological impact, (2) coping and healing, (3) protecting yourself online, (4) protecting yourself offline, (5) victimization across various settings, and (6) seeking meaning and understanding.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The study highlights the effectiveness of data mining and AI in analyzing large public datasets for qualitative research. 
These methods can inform future studies on risky internet behavior, victimization, and assessment strategies in health care settings.</p></sec></abstract><kwd-group><kwd>cyber victimization</kwd><kwd>word adjacency graphing</kwd><kwd>cyberbullying</kwd><kwd>artificial intelligence</kwd><kwd>data mining</kwd><kwd>thematic analysis</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Cyber victimization refers to harmful experiences that occur through the internet, social media, or communication devices. These experiences are often psychologically distressing and have been linked to depression, anxiety, self-harm, and suicidal ideation [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref5">5</xref>].</p><p>Although well-documented as a public health concern [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>], there are limitations in the research, including a lack of studies exploring cyber victimization from the perspectives of those who experience it. Traditional qualitative studies have contributed valuable insights into cyber victimization experiences [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>] but are often limited by small sample sizes and researcher bias [<xref ref-type="bibr" rid="ref10">10</xref>]. In addition, instruments to detect cyber victimization often have different methods for operationalization, using specific terms or providing brief descriptions of acts or experiences [<xref ref-type="bibr" rid="ref11">11</xref>]. The terminology researchers use to define cyber victimization also may not align with the individual perceptions of the experience. 
For example, in 1 study [<xref ref-type="bibr" rid="ref12">12</xref>], users provided more reliable responses when the term &#x201C;cyber victimization&#x201D; was used rather than &#x201C;cyberbullying,&#x201D; suggesting that language choices influence how participants relate to research prompts.</p><p>Data mining and computational qualitative analysis, which involves using computer algorithms and software to collect and analyze qualitative data, is a novel method for understanding cyber victimization. This method has been successfully used to study various public health issues such as substance abuse [<xref ref-type="bibr" rid="ref13">13</xref>], epilepsy [<xref ref-type="bibr" rid="ref14">14</xref>], and intimate partner violence [<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>By mining data from social media sites, researchers can access large-scale data that reflect participants&#x2019; genuine, organic thoughts, feelings, and discussions. Analyzing this user data can reveal patterns and trends that are not apparent using traditional research methods. This information can then be used to inform the development and implementation of interventions tailored to a specific population&#x2019;s unique experiences and needs.</p><p>In this study, we applied data mining and computational qualitative analysis to explore how individuals describe and discuss their experience of cyber victimization on the social media platform Reddit (Reddit, Inc). Our aim was to identify patterns and themes in unsolicited narratives. 
The findings of this study will inform future interventions and improve methods for identifying and supporting individuals who experience cyber victimization.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design Overview</title><p>This qualitative computational analysis used data mining and Word Adjacency Graph (WAG) modeling [<xref ref-type="bibr" rid="ref16">16</xref>] to examine cyber victimization narratives shared by users on Reddit. Data were collected from 2 subreddits, r/cyberbullying and r/bullying, over an 11-year period (2012&#x2010;2023) to capture relevant trends and patterns in discussions surrounding cyber victimization. A systematic data extraction process was conducted using Reddit&#x2019;s application programming interface (API) and a custom web-scraping tool to gather posts and comments. After data cleaning to remove irrelevant content, duplicates, and bot-generated content, WAG modeling was applied to identify patterns and thematic clusters within the text. Following cluster identification, GPT-4 (OpenAI) was used to generate preliminary labels and summaries, which were then manually reviewed for accuracy. A keyword searching process was also conducted to account for evolving language, slang, and abbreviations. Finally, a thematic analysis was performed to refine clusters into relevant themes.</p></sec><sec id="s2-2"><title>Study Setting and Population</title><p>Reddit is a widely used social media and news aggregation platform with over 50 million active users [<xref ref-type="bibr" rid="ref17">17</xref>]. It is known for its anonymous membership, allowing users to share their thoughts and opinions without revealing their identity. 
The platform is divided into over 10,000 &#x201C;subreddits&#x201D; with a wide range of topics, from current events and politics to hobbies and interests [<xref ref-type="bibr" rid="ref18">18</xref>].</p><p>Although there are approximations regarding the age demographics of Reddit [<xref ref-type="bibr" rid="ref19">19</xref>], Reddit does not collect demographic data, making it impossible to determine the exact demographic of users. Despite this limitation, Reddit has been used in previous research studies, providing valuable insights into mental health topics [<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref22">22</xref>]. As demographic information cannot be verified, this study uses the term &#x201C;users&#x201D; to describe individuals who authored posts and comments. The study targeted subreddits r/cyberbullying and r/bullying for data mining, as these communities encourage personal discussions of cyber victimization. Data extraction focused on titles, post bodies, and comments from 2012&#x2010;2023.</p></sec><sec id="s2-3"><title>Data Mining and Computational Analysis</title><p>This study used data mining and WAG modeling [<xref ref-type="bibr" rid="ref16">16</xref>] to examine discussions on Reddit about cyber victimization. Data mining was first conducted, followed by WAG modeling, to reveal patterns and relationships between words and concepts and identify common themes within the text. Data mining consultations were conducted to enhance the validity and to confirm that the process was in alignment with the aim of the study, which was to analyze how users on Reddit describe and discuss their experience of cyber victimization. Following a systematic approach [<xref ref-type="bibr" rid="ref23">23</xref>], data mining comprised 6 stages: collecting requirements, data investigation, data collection, modeling, assessment, and presentation [<xref ref-type="bibr" rid="ref24">24</xref>-<xref ref-type="bibr" rid="ref30">30</xref>]. 
These are outlined in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. A visual overview of the full methodological process is provided in <xref ref-type="fig" rid="figure1">Figure 1</xref>, and the topic detection process using WAG modeling is illustrated in <xref ref-type="fig" rid="figure2">Figure 2</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Overview of the methodological process. AI: artificial intelligence; API: application programming interface; WAG: Word Adjacency Graph.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="infodemiology_v5i1e75493_fig01.png"/></fig><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Topic detection via Word Adjacency Graph modeling.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="infodemiology_v5i1e75493_fig02.png"/></fig><sec id="s2-3-1"><title>Thematic Analysis of Clusters</title><p>Following labeling, clusters were thematically analyzed with MAXQDA 2022 (VERBI Software GmbH), a qualitative data management software program [<xref ref-type="bibr" rid="ref29">29</xref>], to organize clusters into overlapping themes. The full dataset was organized and prepared in a Microsoft Excel document, arranged by cluster, as well as into categories of weak clusters and those not fitting into any cluster. Each post or comment within the clusters was assigned an individual identification number. For example, when a post or comment is the 61st data point within cluster 4, it would be labeled as &#x201C;C4-61.&#x201D;</p><p>Posts and comments organized by cluster were transcribed into MAXQDA and coded with key phrases based on the content and context. Phrases that were similar in context were grouped and organized into themes. 
Following the categorization of the clusters into themes, another researcher independently reviewed these themes to ensure accuracy and consistency. This review process involved a thorough examination of how the themes were derived, ensuring that they accurately reflected the key phrases and context from the original posts and comments. The reviewer also assessed the alignment of the themes with the overall objectives of the study, adjusting where necessary to address any discrepancies or oversights.</p></sec></sec><sec id="s2-4"><title>Ethical Considerations</title><p>This study was reviewed by the Institutional Review Board at Indiana University and determined to be exempt under category 4(i): publicly available information or specimens (Protocol #18415; initial approval February 28, 2023). The exemption was granted because the research involved analysis of publicly accessible Reddit posts without direct interaction with human subjects. Informed consent was not sought, as the data were unsolicited, publicly available, and collected in accordance with established guidelines for internet-based research on publicly accessible content without user interaction, as outlined by Eysenbach and Till [<xref ref-type="bibr" rid="ref31">31</xref>]. To protect privacy and confidentiality, no usernames, profile information, or other potentially identifying details were stored or reported, and example quotes were paraphrased when necessary to minimize traceability via search engines. All electronic data were collected and stored on encrypted devices. Data collection complied with Reddit&#x2019;s API access policies [<xref ref-type="bibr" rid="ref32">32</xref>]. 
Funding for this research was made possible (in part) by Grant Number 5H79SM080386-05 from the Substance Abuse and Mental Health Services Administration (SAMHSA).</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>This study successfully applied data mining, WAG modeling, GPT-4&#x2013;assisted labeling, and thematic analysis to examine cyber victimization narratives shared by users on Reddit. The extracted dataset comprised 13,381 posts and comments from 3283 unique Reddit users. Approximately 5.1% (n=678) of the posts were posted between 2012 and 2018. The remaining 94.9% (n=12,703) of the posts were posted to Reddit from 2019&#x2010;2023.</p><p>To construct the WAG model, only words and word pairs appearing more than 10 times were included, resulting in 150 unique words and 123 word pairs for analysis. As a result, 15% (n=1965) of posts and comments were strongly categorized, 62% (n=8290) were weakly categorized, and 23% (n=2984) were not fitted into the model. The posts that were strongly categorized formed the basis of the 38 clusters (<xref ref-type="fig" rid="figure3">Figure 3</xref>), which were then visualized using Gephi (Gephi Consortium). In this visualization, each node represents a word pair, with node size reflecting word frequency and proximity to similar words. Of the 38 clusters, 35 were relevant to cyber victimization experiences, while 2 clusters were excluded due to irrelevance. 
In addition, the WAG modeling process placed the word pairs of &#x201C;law enforcement&#x201D; and &#x201C;legal action&#x201D; into 2 clusters, but due to their overlapping content, they were merged into a single cluster (<xref ref-type="fig" rid="figure4">Figure 4</xref>).</p><p>The distribution of posts and comments across clusters varied, with some clusters containing a high volume of discussion while others had minimal engagement (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). Clusters 1 and 2 had the highest number of posts and comments, followed by clusters 3 and 0. Some clusters contained fewer than 20 posts and comments, indicating less frequent discussion of those themes. A full list of cluster labels and their thematic categorization is provided in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Word Adjacency Graph results cluster numbers.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="infodemiology_v5i1e75493_fig03.png"/></fig><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Word Adjacency Graph results cluster labels.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="infodemiology_v5i1e75493_fig04.png"/></fig></sec><sec id="s3-2"><title>Cluster Labeling and Validation</title><p>All clusters were manually reviewed and excluded from the model in instances in which users were identified as automated bots or where posts consisted of research recruitment by researchers. The analysis involved screening for data skewness caused by the overrepresentation of individual users. This occurred in situations where users repeatedly posted similar content, which artificially inflated the presence of certain themes or topics in the dataset. 
Such repetitive postings, not indicative of genuine user interactions, were identified and excluded to ensure the integrity and representativeness of the data.</p><p>In analyzing online interactions and narratives surrounding cyber victimization, GPT-4 was initially used to label the resulting 38 clusters (see <xref ref-type="supplementary-material" rid="app2">Multimedia Appendices 2</xref>&#x2013;<xref ref-type="supplementary-material" rid="app3">3</xref>,<xref ref-type="supplementary-material" rid="app4">4</xref>). GPT-4 synthesized a subset of posts and comments from each cluster to generate suggestive labels. The mean number of posts per cluster was 22 (SD 1), with a typical range of 13&#x2010;30 posts; the smallest cluster included 7 posts, and the largest contained 44. These sample sizes were shaped by technical limitations at the time of the study (2023) when the GPT-4 model&#x2019;s context window restricted how much data could be processed at once. At that time, approximately 819 posts (40% of the total cluster data) were used in GPT-4&#x2013;assisted label generation.</p><p>Each suggested label was subsequently evaluated by a human reviewer. The manual review process relied on five validation categories: (1) retained, (2) adjusted, (3) revised, (4) merged, and (5) excluded, which were applied based on how closely the GPT-4 suggested label aligned with the cluster data. As a result, 12 labels were retained with only minimal word changes, 12 labels were adjusted, meaning the suggested label generally matched the content but required rewording to improve clarity or reflect the data more accurately, and 10 labels were completely revised to align with the content of the posts. 
Two clusters were merged due to thematic overlap, and 2 were excluded from the analysis due to irrelevance (see <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>).</p><p>To enhance interpretive rigor, a secondary researcher reviewed all cluster labels and confirmed their alignment with the underlying post content and resulting themes. While formal interrater reliability metrics were not calculated, consensus was achieved through collaborative review and discussion.</p><p>Overall, GPT-4 provided a usable starting point for 36 out of 38 clusters (94.7%), with 26 clusters (68%) requiring some degree of human refinement. Definitions and illustrative examples of each validation category are provided in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>.</p></sec><sec id="s3-3"><title>Focused Keyword Searching</title><p>Focused keyword searching was used to expand the analysis of clustered data and improve the identification of themes. This approach involved reviewing the narratives within each cluster to identify key concepts, expressions, or themes. From these, related terms and alternative phrasings were generated and used as targeted search terms across the dataset. For example, a narrative discussing a specific form of cyberbullying, like blackmail, would lead to the identification of related terms such as &#x201C;blackmail&#x201D; and &#x201C;catfish&#x201D; (<xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>). This strategy allowed for the identification of narratives that were weakly clustered or not clustered at all due to variations in words or phrasing. The total number of posts retrieved via focused keyword searching was not tracked. However, the process did lead to the identification of additional narratives that were thematically aligned but not strongly categorized. 
These supplemental posts helped confirm existing themes and broaden the representation of experiences.</p></sec><sec id="s3-4"><title>Thematic Analysis</title><p>Following labeling, clusters were thematically analyzed with MAXQDA 2022, a qualitative data management software program [<xref ref-type="bibr" rid="ref29">29</xref>], to organize clusters into overlapping themes. Posts and comments organized by cluster were transcribed into MAXQDA, read, and coded with key phrases based on the content and context. Phrases that were similar in context were grouped and organized into themes.</p><p>As a result of this analysis, six themes emerged: (1) psychological impact, which examines the symptoms of cyber victimization; (2) coping and healing, focusing on healing and overcoming cyber victimization and seeking support; (3) protecting yourself online, highlighting methods for preventing or stopping cyber victimization; (4) protecting yourself offline, detailing methods to decrease the risk of being targeted in the physical world; (5) victimization across various settings, exploring the dynamics of victimization in different environments; and (6) seeking meaning and understanding, which includes philosophical discussions about the nature of victimization. A summary table outlining each theme with supporting subtopics is provided in <xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref>. 
A detailed thematic analysis of the qualitative findings will be presented in a separate publication where the themes will be explored in-depth, along with direct quotes and case examples.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study identified 6 overarching themes in Reddit posts related to cyber victimization: psychological impact, coping and healing, protecting oneself online, protecting oneself offline, victimization across various settings, and seeking meaning and understanding.</p><p>By following a structured, hybrid analytic process [<xref ref-type="fig" rid="figure1">Figure 1</xref>] combining data mining, WAG modeling, and GPT-4&#x2013;assisted labeling, this study demonstrates the effectiveness of computational qualitative methods in analyzing large-scale, unsolicited data on cyber victimization. This approach addressed the limitations of traditional qualitative research, especially in the context of handling large amounts of unstructured data. In addition, traditional qualitative research methods are often limited by participant selection biases and social desirability biases, which limit the breadth and depth of the narratives [<xref ref-type="bibr" rid="ref10">10</xref>]. The anonymity of Reddit also encouraged users to share sensitive information without fear of stigma. This approach facilitated the identification of patterns and emerging themes that may not have been captured through manual coding alone.</p><p>This study provided a novel methodological approach for examining cyber victimization experiences and highlights the potential for AI-assisted qualitative analysis. GPT-4 was used in initial cluster labeling, which, when combined with manual review, improved the accuracy of thematic categorization. 
This builds on previous qualitative research by demonstrating how computational tools can be applied to analyze unsolicited narratives, reduce researcher bias, and identify themes across a vast dataset.</p><p>While hybrid computational-qualitative methods have been applied to topics such as substance abuse [<xref ref-type="bibr" rid="ref13">13</xref>], epilepsy [<xref ref-type="bibr" rid="ref14">14</xref>], and intimate partner violence [<xref ref-type="bibr" rid="ref15">15</xref>], this study extends that work by applying similar techniques to cyber victimization. By doing so, it demonstrates the adaptability of WAG modeling and GPT-4&#x2013;assisted labeling to new areas of public health. Recent studies support this hybrid approach. For example, Piper and Wu [<xref ref-type="bibr" rid="ref33">33</xref>] found that large language models (LLMs) performed well in narrative topic labeling, while Castellanos et al [<xref ref-type="bibr" rid="ref34">34</xref>] demonstrated that although GPT-4 generated themes aligned with human coding in over 79% of cases, human coders were still required for accuracy. Our integration of GPT-4&#x2013;assisted labeling with manual review aligns with these findings and demonstrates the need for human oversight.</p><p>The findings have practical implications for health care settings. Recognizing the diverse ways users describe psychological impacts and coping strategies could inform the development of educational resources, screening instruments, and assessment strategies that reflect the language and experiences of victims. These resources would be valuable to health care professionals in identifying individuals at risk in primary care or mental health settings where cyber victimization may go unreported.</p></sec><sec id="s4-2"><title>Limitations</title><p>While this study has many strengths, it is not without limitations. 
Our data consisted of anonymous user narratives from Reddit, making it challenging to determine the generalizability of the sample to a wider population. Reddit users also may have specific characteristics, interests, or behaviors that are not reflective of a broader population [<xref ref-type="bibr" rid="ref35">35</xref>]. In addition, social media platforms have distinct user demographics and cultures, which might influence the nature and extent of cyber victimization experienced by users. For example, Reddit users are more likely to be men [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>], while TikTok (ByteDance Ltd), Facebook (Meta Platforms), and Pinterest (Pinterest, Inc) users are more likely to be women [<xref ref-type="bibr" rid="ref37">37</xref>].</p><p>Due to the inability to follow up with users for explanation and clarification and the inability to meet users face-to-face to assess body language, vocal tones, and facial expressions, there was a potential for misinterpretation of context [<xref ref-type="bibr" rid="ref38">38</xref>]. The anonymous nature of Reddit may also lead to false answers or misinformation, as internet-based platforms can be prone to exaggeration, false claims, or recall bias. In addition, demographic inferences cannot be made, as users are not required to disclose personal information.</p><p>An important limitation of this research is the potential inclusion of bot-generated content [<xref ref-type="bibr" rid="ref39">39</xref>]. Despite efforts to identify and exclude bots based on patterns in posting behavior, timing, frequency, language use, and identifiable usernames, the sophisticated nature of some bots may have allowed them to bypass detection. It is possible that bot-generated posts, which do not reflect human experiences, were incorporated into the dataset, potentially influencing the results.</p><p>Another limitation relates to the evolving nature of LLMs. 
GPT-4, the model used at the time of this study (2023), had a significantly smaller context window than later versions, which limited the number of posts that could be processed per cluster. As a result, only a subset of cluster content was used for label identification. Newer versions of the model support much larger input sizes, which may produce different results, affecting replication in future studies.</p><p>Finally, while GPT-4 accelerated the qualitative analysis process, several limitations must be acknowledged. LLMs are prone to selective summarization and misrepresentation, known as hallucination [<xref ref-type="bibr" rid="ref40">40</xref>]. LLMs may also simplify content while being overconfident in tone, which can influence researchers&#x2019; judgment by making inaccurate or biased content appear more credible than it is. These limitations may have affected the accuracy of cluster label identification. Manual validation was used to mitigate these risks. However, the reliance on GPT-4 for suggestive labeling remains a methodological limitation worth noting.</p></sec><sec id="s4-3"><title>Implications for Further Research</title><p>Future research could expand this methodology to further explore and deepen the understanding of cyber victimization experiences and further refine computational qualitative analysis techniques. While this study focused on cyber victimization experiences in 2 subreddits, other communities may provide further insight into cyber victimization. Future studies could extend data mining and WAG modeling to specific types of cyber victimization&#x2013;related subreddits such as r/stalking, r/cybersecurity, and r/scams, which focus on distinct aspects of harmful experiences. 
Cyber victimization is broad; these specific subreddits could provide a more nuanced understanding of how different forms of cyber victimization are discussed within internet-based communities.</p><p>To explore how cyber victimization experiences vary across internet-based spaces with different user demographics and privacy structures, future research could compare narratives from different platforms (TikTok [ByteDance Ltd], Discord [Discord Inc], Bluesky [Bluesky PBLLC], Instagram [Meta Platforms], and Tumblr [Automattic]). Each platform has unique privacy settings, user demographics, and moderation policies, which may influence how users discuss and experience cyber victimization.</p></sec><sec id="s4-4"><title>Conclusions</title><p>This study used a hybrid methodological approach to analyze how users on Reddit describe their experience of cyber victimization using data mining and computational analysis of unsolicited data. By leveraging data mining and WAG modeling, this study demonstrated the effectiveness of computational methods in qualitative analysis. GPT-4&#x2013;assisted labeling and focused keyword searching further refined thematic identification, resulting in 6 themes: psychological impact, coping and healing, protecting oneself online, protecting oneself offline, victimization across various settings, and seeking meaning and understanding. The methodological approach demonstrated in this study will be valuable to data scientists and health care researchers seeking to analyze social media data on mental health issues. These methods can inform future studies on risky internet behavior, victimization, and assessment strategies in health care settings.</p></sec></sec></body><back><ack><p>Funding for this research was made possible (in part) by Grant Number 5H79SM080386-05 from SAMHSA. 
The views expressed in written training materials or publications and by speakers and moderators do not necessarily reflect the official policies of the Department of Health and Human Services; nor does mention of trade names, commercial practices, or organizations imply endorsement by the U.S. Government.</p><p>The authors would like to acknowledge the contributions of Indiana University School of Nursing for its support in facilitating this research. Special thanks to Dr. Ukamaka Oruche for their guidance and feedback throughout the development of this study.</p></ack><notes><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are not publicly available due to the sensitive nature of the personal narratives of cyber victimization but are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>JA took the lead in conceptualization, with equal contributions from WRM. Data curation was primarily carried out by JA, with equal contributions from DG. Formal analysis was led by JA, with equal contributions from DG and WRM. Funding acquisition was led by JA, with supporting contributions from WRM. Investigation was led by JA, with supporting contributions from WRM and DG. Methodology was led by JA, with equal contributions from DG and WRM. Project administration was led by WRM, with equal contributions from JA. Resources were provided by WRM, with equal contributions from JA. Software was not applicable. Supervision and validation were carried out by WRM. Visualization was completed by DG. The original draft was written primarily by JA, with supporting contributions from WRM and DG. 
Review and editing were led by JA, with equal contributions from WRM and DG.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb3">GPT-4</term><def><p>generative pre-trained transformer 4</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">SAMHSA</term><def><p>Substance Abuse and Mental Health Services Administration</p></def></def-item><def-item><term id="abb6">WAG</term><def><p>word adjacency graph</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>C&#x00E9;nat</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>K</given-names> </name><name name-style="western"><surname>H&#x00E9;bert</surname><given-names>M</given-names> </name><name name-style="western"><surname>Derivois</surname><given-names>D</given-names> </name></person-group><article-title>Cybervictimization and suicidality among French undergraduate Students: A mediation model</article-title><source>J Affect Disord</source><year>2019</year><month>04</month><day>15</day><volume>249</volume><fpage>90</fpage><lpage>95</lpage><pub-id pub-id-type="doi">10.1016/j.jad.2019.02.026</pub-id><pub-id pub-id-type="medline">30769296</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>D</given-names> </name><name 
name-style="western"><surname>Li</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Cyber victimization and adolescent depression: The mediating role of psychological insecurity and the moderating role of perceived social support</article-title><source>Child Youth Serv Rev</source><year>2018</year><month>11</month><volume>94</volume><fpage>10</fpage><lpage>19</lpage><pub-id pub-id-type="doi">10.1016/j.childyouth.2018.09.027</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rose</surname><given-names>CA</given-names> </name><name name-style="western"><surname>Tynes</surname><given-names>BM</given-names> </name></person-group><article-title>Longitudinal associations between cybervictimization and mental health among U.S. adolescents</article-title><source>J Adolesc Health</source><year>2015</year><month>09</month><volume>57</volume><issue>3</issue><fpage>305</fpage><lpage>312</lpage><pub-id pub-id-type="doi">10.1016/j.jadohealth.2015.05.002</pub-id><pub-id pub-id-type="medline">26115909</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>John</surname><given-names>A</given-names> </name><name name-style="western"><surname>Glendenning</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Marchant</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Self-harm, suicidal behaviours, and cyberbullying in children and young people: systematic review</article-title><source>J Med Internet Res</source><year>2018</year><month>04</month><day>19</day><volume>20</volume><issue>4</issue><fpage>e129</fpage><pub-id pub-id-type="doi">10.2196/jmir.9044</pub-id><pub-id pub-id-type="medline">29674305</pub-id></nlm-citation></ref><ref 
id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sampasa-Kanyinga</surname><given-names>H</given-names> </name><name name-style="western"><surname>Roumeliotis</surname><given-names>P</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>H</given-names> </name></person-group><article-title>Associations between cyberbullying and school bullying victimization and suicidal ideation, plans and attempts among Canadian schoolchildren</article-title><source>PLoS ONE</source><year>2014</year><volume>9</volume><issue>7</issue><fpage>1</fpage><lpage>9</lpage><pub-id pub-id-type="doi">10.1371/journal.pone.0102145</pub-id><pub-id pub-id-type="medline">25076490</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="web"><article-title>House Committee on Energy and Commerce Subcommittee on Consumer Protection and Commerce (Committee on Energy and Commerce)</article-title><source>Kids online during COVID: Child safety in an increasingly digital age.</source><year>2021</year><access-date>2025-08-13</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://docs.house.gov/Committee/Calendar/ByEvent.aspx?EventID=111298">https://docs.house.gov/Committee/Calendar/ByEvent.aspx?EventID=111298</ext-link></comment></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sasson</surname><given-names>H</given-names> </name><name name-style="western"><surname>Mesch</surname><given-names>G</given-names> </name></person-group><article-title>Parental mediation, peer norms and risky online behavior among adolescents</article-title><source>Comput Human Behav</source><year>2014</year><month>04</month><volume>33</volume><fpage>32</fpage><lpage>38</lpage><pub-id 
pub-id-type="doi">10.1016/j.chb.2013.12.025</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Radovic</surname><given-names>A</given-names> </name><name name-style="western"><surname>Gmelin</surname><given-names>T</given-names> </name><name name-style="western"><surname>Stein</surname><given-names>BD</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>E</given-names> </name></person-group><article-title>Depressed adolescents&#x2019; positive and negative use of social media</article-title><source>J Adolesc</source><year>2017</year><month>02</month><volume>55</volume><fpage>5</fpage><lpage>15</lpage><pub-id pub-id-type="doi">10.1016/j.adolescence.2016.12.002</pub-id><pub-id pub-id-type="medline">27997851</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Blankenship</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>St. 
Surin</surname><given-names>O</given-names> </name></person-group><article-title>Silent voices: the perception of cyberbullying among at-risk middle school students</article-title><source>Int J Cyber Behav Psychol Learn</source><year>2019</year><month>10</month><volume>9</volume><issue>4</issue><fpage>1</fpage><lpage>21</lpage><pub-id pub-id-type="doi">10.4018/IJCBPL.2019100101</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Queir&#x00F3;s</surname><given-names>A</given-names> </name><name name-style="western"><surname>Faria</surname><given-names>D</given-names> </name><name name-style="western"><surname>Almeida</surname><given-names>F</given-names> </name></person-group><article-title>Strengths and limitations of qualitative and quantitative research methods</article-title><source>Zenodo</source><year>2017</year><month>09</month><day>7</day><pub-id pub-id-type="doi">10.5281/ZENODO.887089</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Menesini</surname><given-names>E</given-names> </name><name name-style="western"><surname>Nocentini</surname><given-names>A</given-names> </name></person-group><article-title>Cyberbullying definition and measurement: Some critical considerations</article-title><source>J Psychol</source><year>2009</year><month>01</month><volume>217</volume><issue>4</issue><fpage>230</fpage><lpage>232</lpage><pub-id pub-id-type="doi">10.1027/0044-3409.217.4.230</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Akbulut</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sahin</surname><given-names>YL</given-names> </name><name 
name-style="western"><surname>Eristi</surname><given-names>B</given-names> </name></person-group><article-title>Development of a scale to investigate cybervictimization among online social utility members</article-title><source>Contemp Educ Technol</source><year>2010</year><volume>1</volume><issue>1</issue><pub-id pub-id-type="doi">10.30935/cedtech/5961</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Sridhar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Pandey</surname><given-names>R</given-names> </name><name name-style="western"><surname>Hasan</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Mohler</surname><given-names>G</given-names> </name></person-group><article-title>Investigate transitions into drug addiction through text mining of Reddit data</article-title><conf-name>KDD &#x2019;19: The 25th ACM SIGKDD Conference on Knowledge Discovery and Data Mining</conf-name><conf-date>Aug 4-8, 2019</conf-date><conf-loc>Anchorage AK USA</conf-loc><fpage>2367</fpage><lpage>2375</lpage><pub-id pub-id-type="doi">10.1145/3292500.3330737</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Miller</surname><given-names>WR</given-names> </name><name name-style="western"><surname>Gesselman</surname><given-names>AN</given-names> </name><name name-style="western"><surname>Garcia</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Groves</surname><given-names>D</given-names> </name><name name-style="western"><surname>Buelow</surname><given-names>JM</given-names> </name></person-group><article-title>Epilepsy-related romantic and sexual relationship 
problems and concerns: indications from internet message boards</article-title><source>Epilepsy Behav</source><year>2017</year><month>09</month><volume>74</volume><fpage>149</fpage><lpage>153</lpage><pub-id pub-id-type="doi">10.1016/j.yebeh.2017.06.023</pub-id><pub-id pub-id-type="medline">28756337</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sivagurunathan</surname><given-names>M</given-names> </name><name name-style="western"><surname>Walton</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Packham</surname><given-names>T</given-names> </name><name name-style="western"><surname>Booth</surname><given-names>RG</given-names> </name><name name-style="western"><surname>MacDermid</surname><given-names>JC</given-names> </name></person-group><article-title>Discourses around male IPV related systemic biases on Reddit</article-title><source>J Interpers Violence</source><year>2022</year><month>10</month><volume>37</volume><issue>19-20</issue><fpage>NP17834</fpage><lpage>NP17859</lpage><pub-id pub-id-type="doi">10.1177/08862605211030015</pub-id><pub-id pub-id-type="medline">34251276</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Miller</surname><given-names>WR</given-names> </name><name name-style="western"><surname>Groves</surname><given-names>D</given-names> </name><name name-style="western"><surname>Knopf</surname><given-names>A</given-names> </name><name name-style="western"><surname>Otte</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Silverman</surname><given-names>RD</given-names> </name></person-group><article-title>Word adjacency graph modeling: separating signal from noise in big data</article-title><source>West J Nurs 
Res</source><year>2017</year><month>01</month><volume>39</volume><issue>1</issue><fpage>166</fpage><lpage>185</lpage><pub-id pub-id-type="doi">10.1177/0193945916670363</pub-id><pub-id pub-id-type="medline">27655959</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="web"><article-title>Reddit content policy</article-title><source>Reddit Inc</source><year>2018</year><access-date>2023-11-22</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.redditinc.com/policies/content-policy">https://www.redditinc.com/policies/content-policy</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="web"><article-title>Reddit homepage</article-title><source>Reddit Inc</source><year>2023</year><access-date>2023-11-22</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.redditinc.com/">https://www.redditinc.com/</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Dixon</surname><given-names>S</given-names> </name></person-group><article-title>US reddit app users by age 2021</article-title><year>2022</year><access-date>2023-01-01</access-date><publisher-name>Statista</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.statista.com/statistics/1125159/reddit-us-app-users-age/">https://www.statista.com/statistics/1125159/reddit-us-app-users-age/</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sowles</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>McLeary</surname><given-names>M</given-names> </name><name name-style="western"><surname>Optican</surname><given-names>A</given-names> </name><etal/></person-group><article-title>A 
content analysis of an online pro-eating disorder community on Reddit</article-title><source>Body Image</source><year>2018</year><month>03</month><volume>24</volume><fpage>137</fpage><lpage>144</lpage><pub-id pub-id-type="doi">10.1016/j.bodyim.2018.01.001</pub-id><pub-id pub-id-type="medline">29414146</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Arya</surname><given-names>S</given-names> </name><name name-style="western"><surname>Nagappala</surname><given-names>S</given-names> </name><name name-style="western"><surname>Krawczyk</surname><given-names>N</given-names> </name><name name-style="western"><surname>Gi</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Meacham</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Bunting</surname><given-names>AM</given-names> </name></person-group><article-title>Fentanyl in pressed oxycodone pills: a qualitative analysis of online community experiences with an emerging drug trend</article-title><source>Subst Use Misuse</source><year>2022</year><volume>57</volume><issue>13</issue><fpage>1940</fpage><lpage>1945</lpage><pub-id pub-id-type="doi">10.1080/10826084.2022.2120365</pub-id><pub-id pub-id-type="medline">36106770</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Overbeek</surname><given-names>D</given-names> </name><name name-style="western"><surname>Janke</surname><given-names>A</given-names> </name></person-group><article-title>360 characteristics of posts of opioid users on Reddit, an online social media forum, an area for improved harm reduction</article-title><source>Ann Emerg Med</source><year>2018</year><month>10</month><volume>72</volume><issue>4</issue><fpage>S142</fpage><pub-id 
pub-id-type="doi">10.1016/j.annemergmed.2018.08.365</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Ogunleye</surname><given-names>JO</given-names> </name></person-group><article-title>The concept of data mining</article-title><source>Data Mining: Concepts and Applications</source><year>2022</year><access-date>2023-07-31</access-date><publisher-name>IntechOpen</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://directory.doabooks.org/handle/20.500.12854/90223">https://directory.doabooks.org/handle/20.500.12854/90223</ext-link></comment><pub-id pub-id-type="other">978-1-83969-267-3</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Manning</surname><given-names>CD</given-names> </name><name name-style="western"><surname>Raghavan</surname><given-names>P</given-names> </name><name name-style="western"><surname>Sch&#x00FC;tze</surname><given-names>H</given-names> </name></person-group><source>Introduction to Information Retrieval</source><year>2008</year><publisher-name>Cambridge University Press</publisher-name><pub-id pub-id-type="other">978-0-521-86571-5</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jhaver</surname><given-names>S</given-names> </name><name name-style="western"><surname>Birman</surname><given-names>I</given-names> </name><name name-style="western"><surname>Gilbert</surname><given-names>E</given-names> </name><name name-style="western"><surname>Bruckman</surname><given-names>A</given-names> </name></person-group><article-title>Human-machine collaboration for content regulation: The case of Reddit automoderator</article-title><source>ACM Trans Comput-Hum 
Interact</source><year>2019</year><month>07</month><day>19</day><volume>26</volume><issue>5</issue><fpage>1</fpage><lpage>35</lpage><pub-id pub-id-type="doi">10.1145/3338243</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lebeuf</surname><given-names>C</given-names> </name><name name-style="western"><surname>Storey</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Zagalsky</surname><given-names>A</given-names> </name></person-group><article-title>Software bots</article-title><source>IEEE Softw</source><year>2018</year><month>01</month><volume>35</volume><issue>1</issue><fpage>18</fpage><lpage>23</lpage><pub-id pub-id-type="doi">10.1109/MS.2017.4541027</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Waltman</surname><given-names>L</given-names> </name><name name-style="western"><surname>van Eck</surname><given-names>NJ</given-names> </name></person-group><article-title>A smart local moving algorithm for large-scale modularity-based community detection</article-title><source>Eur Phys J B</source><year>2013</year><month>11</month><volume>86</volume><issue>11</issue><fpage>471</fpage><pub-id pub-id-type="doi">10.1140/epjb/e2013-40829-0</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="web"><article-title>ChatGPT [large language model]</article-title><source>OpenAI</source><year>2023</year><access-date>2023-08-11</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://chat.openai.com/chat">https://chat.openai.com/chat</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="web"><article-title>MAXQDA 2020 [computer software]</article-title><source>VERBI 
Software</source><year>2019</year><access-date>2025-08-06</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.maxqda.com/">https://www.maxqda.com/</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bastian</surname><given-names>M</given-names> </name><name name-style="western"><surname>Heymann</surname><given-names>S</given-names> </name><name name-style="western"><surname>Jacomy</surname><given-names>M</given-names> </name></person-group><article-title>Gephi: An open source software for exploring and manipulating networks</article-title><source>ICWSM</source><year>2009</year><volume>3</volume><issue>1</issue><fpage>361</fpage><lpage>362</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://gephi.org/">https://gephi.org/</ext-link></comment><pub-id pub-id-type="doi">10.1609/icwsm.v3i1.13937</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Eysenbach</surname><given-names>G</given-names> </name><name name-style="western"><surname>Till</surname><given-names>JE</given-names> </name></person-group><article-title>Ethical issues in qualitative research on internet communities</article-title><source>BMJ</source><year>2001</year><month>11</month><day>10</day><volume>323</volume><issue>7321</issue><fpage>1103</fpage><lpage>1105</lpage><pub-id pub-id-type="doi">10.1136/bmj.323.7321.1103</pub-id><pub-id pub-id-type="medline">11701577</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="web"><article-title>User agreement</article-title><source>Reddit Inc</source><year>2020</year><access-date>2023-11-22</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.redditinc.com/policies/user-agreement-october-15-2020">https://www.redditinc.com/policies/user-agreement-october-15-2020</ext-link></comment></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Piper</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>S</given-names> </name></person-group><article-title>Evaluating large language models for narrative topic labeling</article-title><year>2025</year><conf-name>Proceedings of the 5th International Conference on Natural Language Processing for Digital Humanities</conf-name><conf-loc>Albuquerque, USA</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2025.nlp4dh-1">https://aclanthology.org/2025.nlp4dh-1</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2025.nlp4dh-1.25</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Castellanos</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Gomes</surname><given-names>P</given-names> </name><name name-style="western"><surname>Vander Meer</surname><given-names>D</given-names> </name><name name-style="western"><surname>Castillo</surname><given-names>A</given-names> </name></person-group><article-title>Large language models for thematic summarization in qualitative health care research: comparative analysis of model and human performance</article-title><source>JMIR AI</source><year>2025</year><month>04</month><day>4</day><volume>4</volume><issue>1</issue><fpage>e64447</fpage><pub-id pub-id-type="doi">10.2196/64447</pub-id><pub-id pub-id-type="medline">40611510</pub-id></nlm-citation></ref><ref 
id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Proferes</surname><given-names>N</given-names> </name><name name-style="western"><surname>Jones</surname><given-names>N</given-names> </name><name name-style="western"><surname>Gilbert</surname><given-names>S</given-names> </name><name name-style="western"><surname>Fiesler</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zimmer</surname><given-names>M</given-names> </name></person-group><article-title>Studying Reddit: a systematic overview of disciplines, approaches, methods, and ethics</article-title><source>Soc Media Soc</source><year>2021</year><month>04</month><volume>7</volume><issue>2</issue><fpage>20563051211019004</fpage><pub-id pub-id-type="doi">10.1177/20563051211019004</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Amaya</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bach</surname><given-names>R</given-names> </name><name name-style="western"><surname>Keusch</surname><given-names>F</given-names> </name><name name-style="western"><surname>Kreuter</surname><given-names>F</given-names> </name></person-group><article-title>New data sources in social science research: things to know before working with Reddit data</article-title><source>Soc Sci Comput Rev</source><year>2021</year><month>10</month><volume>39</volume><issue>5</issue><fpage>943</fpage><lpage>960</lpage><pub-id pub-id-type="doi">10.1177/0894439319893305</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="web"><article-title>Social media fact sheet</article-title><source>Pew Research Center</source><year>2021</year><access-date>2023-12-03</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.pewresearch.org/internet/fact-sheet/social-media/">https://www.pewresearch.org/internet/fact-sheet/social-media/</ext-link></comment></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Richard</surname><given-names>B</given-names> </name><name name-style="western"><surname>Sivo</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Ford</surname><given-names>RC</given-names> </name><etal/></person-group><article-title>A guide to conducting online focus groups via Reddit</article-title><source>Int J Qual Methods</source><year>2021</year><month>01</month><volume>20</volume><fpage>16094069211012217</fpage><pub-id pub-id-type="doi">10.1177/16094069211012217</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Storozuk</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ashley</surname><given-names>M</given-names> </name><name name-style="western"><surname>Delage</surname><given-names>V</given-names> </name><name name-style="western"><surname>Maloney</surname><given-names>EA</given-names> </name></person-group><article-title>Got bots? 
Practical recommendations to protect online survey data from bot attacks</article-title><source>TQMP</source><year>2020</year><volume>16</volume><issue>5</issue><fpage>472</fpage><lpage>481</lpage><pub-id pub-id-type="doi">10.20982/tqmp.16.5.p472</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chelli</surname><given-names>M</given-names> </name><name name-style="western"><surname>Descamps</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lavou&#x00E9;</surname><given-names>V</given-names> </name><etal/></person-group><article-title>Hallucination rates and reference accuracy of ChatGPT and Bard for systematic reviews: comparative analysis</article-title><source>J Med Internet Res</source><year>2024</year><month>05</month><day>22</day><volume>26</volume><fpage>e53164</fpage><pub-id pub-id-type="doi">10.2196/53164</pub-id><pub-id pub-id-type="medline">38776130</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Detailed methodological process for data mining.</p><media xlink:href="infodemiology_v5i1e75493_app1.docx" xlink:title="DOCX File, 22 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Number of posts and comments per cluster.</p><media xlink:href="infodemiology_v5i1e75493_app2.png" xlink:title="PNG File, 27 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>List of cluster labels and thematic categorization.</p><media xlink:href="infodemiology_v5i1e75493_app3.docx" xlink:title="DOCX File, 18 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Manual review outcomes for GPT-4-generated labels.</p><media xlink:href="infodemiology_v5i1e75493_app4.docx" xlink:title="DOCX File, 16 
KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Definitions and examples of GPT-4 label validation categories.</p><media xlink:href="infodemiology_v5i1e75493_app5.docx" xlink:title="DOCX File, 15 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Example analysis and narrative linkages.</p><media xlink:href="infodemiology_v5i1e75493_app6.docx" xlink:title="DOCX File, 17 KB"/></supplementary-material><supplementary-material id="app7"><label>Multimedia Appendix 7</label><p>Summary of identified themes from thematic analysis.</p><media xlink:href="infodemiology_v5i1e75493_app7.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material></app-group></back></article>