<?xml version="1.0" encoding="iso-8859-1" standalone="no"?>
<!DOCTYPE GmsArticle SYSTEM "http://www.egms.de/dtd/2.0.34/GmsArticle.dtd">
<GmsArticle xmlns:xlink="http://www.w3.org/1999/xlink">
  <MetaData>
    <Identifier>mibe000202</Identifier>
    <IdentifierDoi>10.3205/mibe000202</IdentifierDoi>
    <IdentifierUrn>urn:nbn:de:0183-mibe0002027</IdentifierUrn>
    <ArticleType>Research Article</ArticleType>
    <TitleGroup>
      <Title language="en">Data quality monitoring in clinical and observational epidemiologic studies: the role of metadata and process information</Title>
      <TitleTranslated language="de">Management von Datenqualit&#228;t in klinischen und beobachtenden epidemiologischen Studien: Die Rolle von Metadaten und Prozessinformationen</TitleTranslated>
    </TitleGroup>
    <CreatorList>
      <Creator>
        <PersonNames>
          <Lastname>Richter</Lastname>
          <LastnameHeading>Richter</LastnameHeading>
          <Firstname>Adrian</Firstname>
          <Initials>A</Initials>
          <AcademicTitle>Dr.</AcademicTitle>
        </PersonNames>
        <Address>Institut f&#252;r Community Medicine, Walther-Rathenau-Stra&#223;e 48, 17475 Greifswald, Phone: &#43;49 3834 867710<Affiliation>Institute for Community Medicine, University Medicine Greifswald, Germany</Affiliation></Address>
        <Email>adrian.richter&#64;uni-greifswald.de</Email>
        <Creatorrole corresponding="yes" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Sch&#246;ssow</Lastname>
          <LastnameHeading>Sch&#246;ssow</LastnameHeading>
          <Firstname>Janka</Firstname>
          <Initials>J</Initials>
        </PersonNames>
        <Address>
          <Affiliation>Institute for Community Medicine, University Medicine Greifswald, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Werner</Lastname>
          <LastnameHeading>Werner</LastnameHeading>
          <Firstname>Andr&#233;</Firstname>
          <Initials>A</Initials>
        </PersonNames>
        <Address>
          <Affiliation>Institute for Community Medicine, University Medicine Greifswald, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Schauer</Lastname>
          <LastnameHeading>Schauer</LastnameHeading>
          <Firstname>Birgit</Firstname>
          <Initials>B</Initials>
        </PersonNames>
        <Address>
          <Affiliation>Institute for Community Medicine, University Medicine Greifswald, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Radke</Lastname>
          <LastnameHeading>Radke</LastnameHeading>
          <Firstname>D&#246;rte</Firstname>
          <Initials>D</Initials>
        </PersonNames>
        <Address>
          <Affiliation>Institute for Community Medicine, University Medicine Greifswald, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Henke</Lastname>
          <LastnameHeading>Henke</LastnameHeading>
          <Firstname>J&#246;rg</Firstname>
          <Initials>J</Initials>
        </PersonNames>
        <Address>
          <Affiliation>Institute for Community Medicine, University Medicine Greifswald, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Struckmann</Lastname>
          <LastnameHeading>Struckmann</LastnameHeading>
          <Firstname>Stephan</Firstname>
          <Initials>S</Initials>
        </PersonNames>
        <Address>
          <Affiliation>Institute for Community Medicine, University Medicine Greifswald, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
      <Creator>
        <PersonNames>
          <Lastname>Schmidt</Lastname>
          <LastnameHeading>Schmidt</LastnameHeading>
          <Firstname>Carsten Oliver</Firstname>
          <Initials>CO</Initials>
        </PersonNames>
        <Address>
          <Affiliation>Institute for Community Medicine, University Medicine Greifswald, Germany</Affiliation>
        </Address>
        <Creatorrole corresponding="no" presenting="no">author</Creatorrole>
      </Creator>
    </CreatorList>
    <PublisherList>
      <Publisher>
        <Corporation>
          <Corporatename>German Medical Science GMS Publishing House</Corporatename>
        </Corporation>
        <Address>D&#252;sseldorf</Address>
      </Publisher>
    </PublisherList>
    <SubjectGroup>
      <SubjectheadingDDB>610</SubjectheadingDDB>
      <Keyword language="en">data quality</Keyword>
      <Keyword language="en">metadata</Keyword>
      <Keyword language="en">process variables</Keyword>
      <Keyword language="en">data monitoring</Keyword>
      <Keyword language="en">health research</Keyword>
      <Keyword language="en">cohort studies</Keyword>
      <Keyword language="de">Datenqualit&#228;t</Keyword>
      <Keyword language="de">Metadaten</Keyword>
      <Keyword language="de">Prozessvariablen</Keyword>
      <Keyword language="de">Datenmonitoring</Keyword>
      <Keyword language="de">Gesundheitsforschung</Keyword>
      <Keyword language="de">Kohortenstudien</Keyword>
    </SubjectGroup>
    <DatePublishedList>
      
    <DatePublished>20191108</DatePublished></DatePublishedList>
    <Language>engl</Language>
    <License license-type="open-access" xlink:href="http://creativecommons.org/licenses/by/4.0/">
      <AltText language="en">This is an Open Access article distributed under the terms of the Creative Commons Attribution 4.0 License.</AltText>
      <AltText language="de">Dieser Artikel ist ein Open-Access-Artikel und steht unter den Lizenzbedingungen der Creative Commons Attribution 4.0 License (Namensnennung).</AltText>
    </License>
    <SourceGroup>
      <Journal>
        <ISSN>1860-9171</ISSN>
        <Volume>15</Volume>
        <Issue>1</Issue>
        <JournalTitle>GMS Medizinische Informatik, Biometrie und Epidemiologie</JournalTitle>
        <JournalTitleAbbr>GMS Med Inform Biom Epidemiol</JournalTitleAbbr>
      </Journal>
    </SourceGroup>
    <ArticleNo>08</ArticleNo>
    <Fundings>
      <Funding>Ministerium f&#252;r Bildung, Wissenschaft und Kultur Mecklenburg-Vorpommern</Funding>
      <Funding fundId="ESF/UG 11 035A">EC</Funding>
      <Funding fundId="SCHM 2744/3-1">Deutsche Forschungsgemeinschaft (DFG)</Funding>
    </Fundings>
  </MetaData>
  <OrigData>
    <Abstract language="de" linked="yes"><Pgraph>Eine hohe Datenqualit&#228;t ist eine wesentliche Voraussetzung f&#252;r valide Entscheidungen in der Gesundheitsforschung. Metadaten bzw. &#8222;Daten &#252;ber andere Daten&#8220; sind f&#252;r die Implementierung eines Datenqualit&#228;tsmonitorings essentiell. Klare Empfehlungen und Benennungen von Metadaten f&#252;r spezifische Aspekte von Datenqualit&#228;t werden in relevanter Literatur jedoch nicht gegeben. Gleichfalls ist nicht klar, welche Informationen &#252;ber den datengenerierenden Prozess gesammelt werden sollten, um Studiendesign und -durchf&#252;hrung zu verbessern. In dieser Arbeit wird unter konzeptioneller Perspektive ein &#220;berblick zu Metadaten und Prozessinformationen gegeben, welche in der Kohortenstudie Study of Health in Pomerania (SHIP) verwendet werden. Zur&#252;ckliegend wurde in SHIP das allgemein gebr&#228;uchliche <Mark2>Data Dictionary</Mark2> um Informationen erweitert, welche f&#252;r Datenqualit&#228;tsbewertungen verwendet werden und diese auch steuern k&#246;nnen; bis zu 20 unterschiedliche Charakteristika von Variablen k&#246;nnen spezifiziert werden. Konzeptionell werden hierf&#252;r statische von variablen Metadaten sowie Prozessvariablen unterschieden. Zum Beispiel sind die Verteilungsform, Plausibilit&#228;ts- und Zul&#228;ssigkeitsgrenzen sowie der Dateneingabetyp statische Metadaten. Variierende Referenzgrenzen von z.B. Laborparametern werden als variable Metadaten betrachtet. Diese Information erlaubt die Identifizierung von Beeintr&#228;chtigungen der Datenqualit&#228;t durch einen Vergleich von beobachteten und erwarteten Charakteristika der Daten. Prozessvariablen wie die ID des Untersuchers oder des Messger&#228;ts erlauben hingegen die Identifikation von m&#246;glichen Quellen f&#252;r Fehler, selbst wenn keine Metadaten verletzt wurden. 
Metadaten und Prozessvariablen k&#246;nnen jeweils allein oder in Kombination verwendet werden, um vielseitige und effiziente Qualit&#228;tsbewertungen umzusetzen. Die Erstellung notwendiger Metadaten und die Definition von Prozessvariablen bedeuten einen erheblichen Aufwand, insbesondere f&#252;r gr&#246;&#223;ere Studien. Der Zugewinn an Transparenz und Effektivit&#228;t bei der Qualit&#228;tsberichterstellung ist jedoch erheblich.</Pgraph></Abstract>
    <Abstract language="en" linked="yes"><Pgraph>High data quality is fundamental for valid inferences in health research. Metadata, i.e. &#8220;data that describe other data&#8221;, are essential to implement data quality assessments but more guidance on which metadata to use is needed. Similarly, the selection and use of variables describing the measurement process should be exemplified to improve the design and conduct of observational health studies. This work provides a conceptual framework and overview of metadata and process information for systematic data quality reports based on implementations within the population-based cohort Study of Health in Pomerania (SHIP). In previous years, a prerequisite for automated data quality checks has been established by the augmentation of the data dictionary; the added information of up to 20 different characteristics for each variable is used for data quality assessments and triggers diverse data quality checks. Conceptually we distinguish static metadata, variable metadata, and process variables. Examples for static metadata are the expected probability distribution, plausibility limits, and the data type. Variable metadata may be reference limits of a laboratory marker. Information inherent to these metadata allows for the detection of data quality flaws by comparing observed with expected data characteristics. In contrast, process variables, such as the observer or device ID, also allow for the identification of sources of data quality issues. This is the case even if characteristics defined in metadata were not violated. Metadata and process variables can be used alone or in combination to implement a versatile and efficient data quality assessment. A comprehensive setup of metadata and process variables is an extensive task, particularly in studies involving large data collections. 
Nonetheless, the gain in transparency and efficacy of data curation and quality reporting after this setup is considerable.</Pgraph></Abstract>
    <TextBlock linked="yes" name="Introduction">
      <MainHeadline>Introduction</MainHeadline><Pgraph>Metadata is considered as &#8220;data that describe other data&#8221; <TextLink reference="1"></TextLink>. It plays a key role for the assessment of data quality in different scientific disciplines. Definitions and use of metadata are manifold <TextLink reference="2"></TextLink>, <TextLink reference="3"></TextLink>, <TextLink reference="4"></TextLink>, <TextLink reference="5"></TextLink>, <TextLink reference="6"></TextLink>. In health research, metadata may cover conceptual aspects such as descriptions of the sampling scheme of a study, or it can relate to specific characteristics of single measurement variables <TextLink reference="7"></TextLink> such as the variable name, plausibility limits, or the data type. Most software for either electronic data capture or data quality assessments such as RedCap <TextLink reference="8"></TextLink>, Square&#178; <TextLink reference="9"></TextLink> or OPAL <TextLink reference="10"></TextLink> make systematic use of metadata. A German guideline on data quality in medical research also presumes an existing metadata concept <TextLink reference="11"></TextLink>. However, these and other works <TextLink reference="12"></TextLink>, <TextLink reference="13"></TextLink>, <TextLink reference="14"></TextLink>, <TextLink reference="15"></TextLink> do not provide clear guidance on the extent, structure and use of metadata for systematic data quality assessments. </Pgraph><Pgraph>More attention also needs to be given to assessments of the data generation process. Methods from statistical process control and industrial statistics suggest considering factors that might affect the data generating process. Respective factors are called <Mark2>process variables</Mark2> and are systematically controlled in designs of experiments <TextLink reference="16"></TextLink>. 
Similarly to manufacturers and engineers, principal investigators (PIs) and scientists of observational studies with primary data collections have control over the data generating process. This characteristic differentiates primary from secondary data collections and enables interventions during ongoing studies. </Pgraph><Pgraph>Accordingly, adequate assessments of data quality in health studies should make use of metadata and carefully monitor the process under which measurements are obtained. A simple use case illustrates this necessity. In the population-based Study of Health in Pomerania (SHIP) <TextLink reference="17"></TextLink> participants are examined by different examiners in a dedicated center including the drawing of blood samples to determine for example C-reactive protein (CRP). A CRP value may be missing for, among others, the following reasons: the actual value was below the detection limit of a device, a participant refused to provide a blood sample, or the examination was aborted. Related process information includes the examiner, the time of day, and the transport time elapsed between drawing of the blood sample and the final storage in the biorepository. Recording and investigating the frequencies of reasons for missing values in combination with associated process variables may point to possible targets of intervention, e.g. a training of examiners or the re-calibration of devices. </Pgraph><Pgraph>This work provides an overview of metadata and process variables along with conceptual considerations to support the implementation of systematic and automated data quality assessments based on our experience in the SHIP study. </Pgraph></TextBlock>
    <TextBlock linked="yes" name="Methods">
      <MainHeadline>Methods</MainHeadline><Pgraph>The methodological background for this work originates from two decades of experience with data management and data monitoring in the Study of Health in Pomerania (SHIP) <TextLink reference="18"></TextLink>. The SHIP study comprises two cohorts (SHIP and SHIP-TREND) with in total 8,728 participants. To date, four SHIP and two SHIP-TREND waves have been completed. More than 40,000 variables originate from computer-assisted personal interviews, self-reported questionnaires, biomaterials (blood, urine, faeces, saliva), imaging data (e.g. ultrasound and MRI), and a wide range of clinical examinations, including dental, dermatological, and cardiovascular measurements. Each electronic case report form (eCRF) in SHIP is based on metadata and collects process information. In addition, OMICS data complement the data collection, as well as subsequent secondary assessments, e.g. readings of magnetic resonance images. </Pgraph><Pgraph>Quality management within SHIP rests upon the storage of study data and metadata in a central data repository. For this purpose a PostgreSQL database backend is used <TextLink reference="19"></TextLink>. Web applications support the creation of data <TextGroup><PlainText>dictionary</PlainText></TextGroup> elements (Shipdesigner), and electronic data capture (Shippie). The former is used for metadata setup which is used by the latter to control for errors in the data entry process. Subsequently, routines in SAS and in dedicated data quality assessment environments (SQuaRe, Square&#178;) make use of metadata and process variables to conduct data quality checks <TextLink reference="9"></TextLink>, <TextLink reference="19"></TextLink>, <TextLink reference="20"></TextLink>. </Pgraph><Pgraph>This work summarizes metadata and process variables utilized in these applications and structures them (i) according to the type of input data and (ii) the data quality dimensions completeness and correctness. 
These data quality dimensions are sometimes referred to as intrinsic data quality <TextLink reference="21"></TextLink>, <TextLink reference="22"></TextLink>, i.e. data quality which can be evaluated without the use of contextual information such as a specific research question.</Pgraph></TextBlock>
    <TextBlock linked="yes" name="Results">
      <MainHeadline>Results</MainHeadline><SubHeadline>Data structure and terminology</SubHeadline><Pgraph>Our approach considers the relations of <Mark2>study data</Mark2> and <Mark2>metadata</Mark2>. Study data comprise identifiers for observational units, the clinical measurements, variables describing the process under which the data were collected and in some cases varying metadata. Each column of the study data contains varying data values or missings. </Pgraph><Pgraph>Pre-defined static characteristics that apply to each column in the study data, such as a label, a value list or the data type, form the basis of static metadata on the variable level and may be stored in various forms. One option as implemented and used in SHIP is depicted in Figure 1 <ImgLink imgNo="1" imgType="figure"/>, where the characteristics related to columns of the study data are stored in a separate table of static metadata. In SHIP each column of the study data is identifiable via a &#8220;key&#8221; which is defined in the static metadata.</Pgraph><SubHeadline>Metadata for data quality assessments</SubHeadline><Pgraph>Metadata used for data quality assessments describe desired or expected properties of the data <TextLink reference="23"></TextLink> as well as additional information, for example, semantic annotation of variables based on uniform codes <TextLink reference="24"></TextLink>. Each column of the study data has assigned static properties which are valid for the life cycle of the respective health research study <TextLink reference="8"></TextLink>. Ideally, relevant metadata are defined before the data collection starts. Typical static metadata are the variable name and the data type (Figure 1 <ImgLink imgNo="1" imgType="figure"/>, top right panel). Further examples are shown in Table 1 <ImgLink imgNo="1" imgType="table"/>. 
Such descriptive characteristics are usually denoted in the data dictionary (DD) of most studies.</Pgraph><Pgraph>In some cases applicable metadata may also vary across observations. For example, the detection limit of a new device has changed or reference limits of laboratory markers vary in a long-term study. In this case, a metadata variable needs to be included in the study data to assign varying reference limits to the target measurement (Figure 1 <ImgLink imgNo="1" imgType="figure"/>, left panel). The link from the measurement variable (CRP) to the respective metadata variable (RefLimits&#95;v101) is defined via an own column in the static metadata. The top right panel of Figure 1 <ImgLink imgNo="1" imgType="figure"/> mentions <Mark2>key&#95;ref&#95;limits</Mark2> which specifies the key of the variable containing the time-varying reference limits for CRP. Similar columns are denoted as key-columns which point to the associated metadata variable. Such structural information is required to implement automated procedures of data quality assessments.</Pgraph><Pgraph>Metadata may comprise information related to different aspects of data quality:</Pgraph><Pgraph><UnorderedList><ListItem level="1">data completeness (e.g. reasons for missing data, such as conditionally missing data in the category &#8220;birth complications&#8221; for males)</ListItem><ListItem level="1">data correctness (e.g. value lists, detection limits, admissible values, plausibility limits)</ListItem><ListItem level="1">the selection of statistical approaches to data quality checks (e.g. data types, distributional class)</ListItem></UnorderedList></Pgraph><Pgraph>In addition to the investigation of data characteristics a coherent and readable reporting is important. Particularly in studies involving thousands of study data variables the presentation quality is challenging but essential to impede misinterpretation <TextLink reference="25"></TextLink>. 
Therefore, further static metadata such as labels and units of measurement can be defined to ensure a readable and standardized output. For example, the assignment of fixed colors for examiners or devices is recommended across graphical outputs (not shown in Figure 1). </Pgraph><SubHeadline>Process variables for data quality assessments</SubHeadline><Pgraph>Measurements in health research are vulnerable to various sources of distortion. Environmental conditions as well as examiners and devices may change over time. Process variables are needed to capture such information <TextLink reference="16"></TextLink> along with the measurements in study data. Insofar process variables can be considered measurements themselves and may relate to:</Pgraph><Pgraph><UnorderedList><ListItem level="1">study conduct (e.g. observer, device ID, location)</ListItem><ListItem level="1">environmental conditions (e.g. examination times, processing times, room temperature, humidity).</ListItem></UnorderedList></Pgraph><Pgraph>Examples of process variables utilized in SHIP are provided in Table 2 <ImgLink imgNo="2" imgType="table"/>. Each eCRF in SHIP prompts the recording of such information, for example, regarding ultrasound examination of the thyroid (<Hyperlink href="https:&#47;&#47;medical-data-models.org&#47;30755">https:&#47;&#47;medical-data-models.org&#47;30755</Hyperlink>) <TextLink reference="26"></TextLink> almost 25&#37; of recorded variables comprise process information. The definition and identification of relevant process variables rests on appropriate background knowledge about factors that may influence the measurements. Their implementation in the measurement process may impose considerable additional efforts for the study conduct: additional measurement devices might be required with all related data quality management logistics. 
</Pgraph><Pgraph>There is a crucial difference regarding the definition and application of process variables and metadata: while metadata can be defined even after the data collection has been finished this is very difficult or impossible for process variables. Process variables should be identified and implemented prior to the start of a data collection to avoid missing and unrecoverable process information. Process information themselves are measurements of the conditions under which study data were generated. </Pgraph><SubHeadline>Use of metadata for data quality assessments </SubHeadline><Pgraph>Data quality assessments use metadata to investigate the compliance of observed data with expected properties <TextLink reference="20"></TextLink>, <TextLink reference="23"></TextLink>. For example, if a categorical variable has per design four distinct values (Figure 1 <ImgLink imgNo="1" imgType="figure"/>, &#8220;value&#95;list&#8221;) and the data show five, at least one invalid data value has been observed. Such data quality checks, also referred to as edit-, range- or cross checks <TextLink reference="27"></TextLink>, predominantly focus on the evaluation of entries in single data fields, i.e. each data field is checked against the desired properties of the data as coded in metadata. This means, for example regarding CRP in Figure 1 <ImgLink imgNo="1" imgType="figure"/>, missing codes can be tabulated to infer on reasons for unavailable measurements (completeness), those being smaller than zero (inadmissibility) and those being greater than five (plausibility) are counted or flagged as potential correctness issues.</Pgraph><Pgraph>In the SHIP workflow, initial data quality checks for missing values and correctness predominantly rely on static metadata. This comprises checks during data entry in the Shippie electronic case reporting forms (eCRFs). 
After data capture, automated data quality controls are conducted based on SAS routines and batch jobs <TextLink reference="19"></TextLink>. These checks are routinely conducted every night for the entire ongoing data collection. Feedback on issues is obtained through an MS Access data entry mask for each flagged data quality issue to ensure a timely response by the responsible quality manager. Only deviations from static metadata will be encountered at this stage, although some measurements might be inaccurate without violating predefined properties. </Pgraph><SubHeadline>Use of process variables for data quality assessments</SubHeadline><Pgraph>Data quality may be impaired although, according to metadata, formal discrepancies between observed data and expected properties are absent. Common examples are observer or device effects which can be impossible to detect in the overall distribution of a measurement. Process variables allow for the detection of data quality issues and their possible sources. The main focus of data quality assessments using process variables is their association with distributional characteristics of measurements. For example, very high room temperatures may explain lower performance in spiroergometry. Seasonal changes in outcome variables that were identified using the examination date might be explainable this way. </Pgraph><Pgraph>Process variables are also required to assess the compliance with procedural rules, e.g. the analysis of sufficient resting time before a blood pressure measurement starts. Therefore, process variables may be automatically stored by recording start and end time of examinations. Other process variables can be used to check for appropriate ambient conditions under which measurements took place. For example, does the size of an arm cuff used for blood pressure measurement correspond with participants&#8217; arm circumference&#63; 
These examples illustrate the different use of process variables compared to metadata for data quality assessments. </Pgraph><Pgraph>In the SHIP workflow, data quality issues measured by process variables are the main target of web applications (Square&#178;) dedicated to data quality assessment <TextLink reference="9"></TextLink>, <TextLink reference="20"></TextLink>. Related reports are generated semi-automatically in defined intervals or on demand, using PDF as output format. Encountered issues are the basis for systematic feedback to the SHIP examination team and may trigger trainings. Contrary to the use of metadata for checks of single data fields, the use of process variables faces an important limitation. They require a sufficient number of cases to reliably detect data quality issues such as observer differences or time trends. </Pgraph><SubHeadline>Combined use of metadata and process variables for data quality assessments</SubHeadline><Pgraph>The combination of metadata and process variables enables versatile data quality assessments. Univariate analyses may reveal disproportionate numbers of missings or implausible measurements. In combination with process variables such as examiner, device, or preprocessing characteristics, a potential error source may be identified. For example, the measurement variable CRP in <TextGroup><PlainText>Figure 1 </PlainText></TextGroup><ImgLink imgNo="1" imgType="figure"/> has two data values representing missing codes; one code has the denotation &#8220;polluted sample material&#8221;. If such a missing code occurs frequently, the source should be investigated. For example, the pre-processing of samples in the laboratory or the handling of samples during&#47;after blood drawing may cause contamination of samples. </Pgraph></TextBlock>
    <TextBlock linked="yes" name="Discussion">
      <MainHeadline>Discussion</MainHeadline><Pgraph>This work provides an overview of metadata and process variables to monitor and improve data quality of observational studies. Accompanying conceptual considerations differentiate the features of static and variable metadata as well as process variables to support their handling in data quality assessments. The use of metadata in health research studies is crucial to follow guidelines <TextLink reference="11"></TextLink> and to use metadata driven quality control with web applications such as RedCap, Square&#178; or OPAL <TextLink reference="8"></TextLink>, <TextLink reference="9"></TextLink>, <TextLink reference="10"></TextLink>. Similarly, process variables which were introduced from industrial statistics <TextLink reference="16"></TextLink> are essential in health research studies with primary data collections, since varying conditions of the measurement process might distort the quality of the data. </Pgraph><Pgraph>The appropriate consideration of metadata and process variables may appear straightforward but in complex studies the setup is likely to be challenging and time consuming. Assigning unambiguous and understandable labels for thousands of variables requires consistent checks of the DD. In this context the use of unambiguous semantic annotation is particularly beneficial. For example, by unambiguous UMLS codes <TextLink reference="24"></TextLink> which have been assigned to SHIP variables in a cooperation with the Portal for Medical Data Models (MDM) <TextLink reference="26"></TextLink> to improve harmonized comparisons across studies. Some decisions for and definitions of static metadata are only possible with the intended outcome of the data quality reports in mind. Furthermore, it might be a matter of debate which plausibility or admissibility limits to define in a given study. 
However, starting a study with imperfect limits and comparing the data with those is more valuable than defining no limits at all, i.e. implementing no checks on measurement limits. </Pgraph><Pgraph>Metadata themselves can be a gateway for data quality flaws. For example, the static metadata <Mark2>value list</Mark2> and <Mark2>missing codes</Mark2> should be separate sets of values for each study variable, otherwise script-based routines might fail or the output of quality reports gets odd. Therefore, the coherent definition of data characteristics as metadata may require considerable effort if conducted for several thousand variables. However, augmenting the data dictionary with only some information required for data quality assessments (e.g. limits) already enables essential data quality checks, particularly in larger and long-lasting studies.</Pgraph><Pgraph>The introduction and use of process variables for data quality assessments is of utmost importance. They may point to means of intervention in the data generating process. A systematic understanding and selection of relevant process variables may require a review of existing literature because each examination needs to be considered independently. The collection of process variables can be elaborate and may add to the costs of a study. For example, monitoring ambient conditions in each examination room requires additional equipment, along with extensions of the data base and potentially the eCRFs. However, omitting the use of process variables eliminates one major advantage of primary data collections for data quality management: to trace back sources of errors and to regain control over a data generating process by close monitoring of this process.</Pgraph><Pgraph>The overview provided in this work has some limitations. Presented static and variable metadata are not comprehensive regarding other types of data (e.g. OMICS) <TextLink reference="28"></TextLink>. 
We also omitted a discussion of longitudinal aspects of data quality assessments. In fact, some could be easily integrated into the presented concept, e.g. another static metadata attribute may link different measurement variables for correctness checks such as &#8220;is age at follow-up higher than at baseline&#8221;. However, longitudinal issues have special requirements regarding the format of the data (wide vs. long) and may require more complex statistical techniques for their assessment. Another limitation rests with the use of semantic annotation for study data variables. Such unambiguous codes facilitate correct interpretation of data but are currently not used for data quality assessments. Worthy of note is also the restricted use case for metadata presented in this work. Metadata are of importance beyond this application, for example, for the selection of databases with similar populations and study focus.</Pgraph><Pgraph>High data quality inherently means that data should be fit for use. Completeness and correctness do not entirely account for this demand. Additional contextual information is necessary to evaluate the achieved data quality with respect to the intended use. For example, are all variables of interest available in a study with a sufficient sample size to analyze the effects of inflammation markers on back pain&#63; Contextual information varies strongly with the research questions and is difficult to integrate into a standardized metadata concept. Varying contextual information may also lead to different conclusions regarding the obtained data quality for the same data collection <TextLink reference="22"></TextLink>. For example, sampling errors may impair the representativeness of data. This is of importance if we are interested in the prevalence of population-based risk factors. However, it may be less important if we are interested only in associations of risk factors. 
</Pgraph><Pgraph>Many aspects regarding the utility of metadata and process information provided in this overview are likely to be well known. However, comprehensive overviews are lacking and, in practice, their use seems inconsistent. This has contributed to criticism regarding the transparency and reproducibility of research findings <TextLink reference="29"></TextLink>. The provided overview may assist in the setup of data dictionaries for new studies or the augmentation of data dictionaries for existing studies. In particular, smaller studies and those under development may profit from this overview in terms of transparency and several options for data quality management. Adding data-quality-related metadata to the DD provides an overview of applicable data quality checks. The largest gains in efficiency regarding the generation of data quality reports will be noticeable in larger, long-lasting studies requiring repeated data quality reporting.</Pgraph><Pgraph>Although metadata and process variables should be defined prior to the data collection, many pitfalls and concept flaws may only become obvious during data collection and after the system has gone into production. Therefore, improving the quality of primary health data may require adaptations of the metadata concept or the selection and measurement of process variables throughout the study.</Pgraph></TextBlock>
    <TextBlock linked="yes" name="Notes">
      <MainHeadline>Notes</MainHeadline><SubHeadline>Competing interests</SubHeadline><Pgraph>The authors declare that they have no competing interests.</Pgraph><SubHeadline>Funding</SubHeadline><Pgraph>The development of work underlying this paper was supported by the Ministry for Education, Science and Culture of the State of Mecklenburg-Vorpommern, the European Social Fund (Grant UG 11 035A), and by the German Research Foundation (DFG, SCHM 2744&#47;3-1).</Pgraph></TextBlock>
    <References linked="yes">
      <Reference refNo="1">
        <RefAuthor>Nadkarni PM</RefAuthor>
        <RefTitle>What Is Metadata&#63;</RefTitle>
        <RefYear>2011</RefYear>
        <RefBookTitle>Metadata-driven software systems in biomedicine: designing systems that can adapt to changing knowledge</RefBookTitle>
        <RefPage>1-16</RefPage>
        <RefTotal>Nadkarni PM. What Is Metadata&#63; In: Metadata-driven software systems in biomedicine: designing systems that can adapt to changing knowledge. London, New York: Springer; 2011. (Health informatics). p. 1-16. DOI: 10.1007&#47;978-0-85729-510-1&#95;1</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.1007&#47;978-0-85729-510-1&#95;1</RefLink>
      </Reference>
      <Reference refNo="2">
        <RefAuthor>Schuurman N</RefAuthor>
        <RefAuthor>Leszczynski A</RefAuthor>
        <RefTitle>Ontology-based metadata</RefTitle>
        <RefYear>2006</RefYear>
        <RefJournal>Trans GIS</RefJournal>
        <RefPage>709-26</RefPage>
        <RefTotal>Schuurman N, Leszczynski A. Ontology-based metadata. Trans GIS. 2006;10(5):709-26. DOI: 10.1111&#47;j.1467-9671.2006.01024.x</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.1111&#47;j.1467-9671.2006.01024.x</RefLink>
      </Reference>
      <Reference refNo="3">
        <RefAuthor>Vardigan M</RefAuthor>
        <RefAuthor>Heus P</RefAuthor>
        <RefAuthor>Thomas W</RefAuthor>
        <RefTitle>Data documentation initiative: Toward a standard for the social sciences</RefTitle>
        <RefYear>2008</RefYear>
        <RefJournal>Int J Digit Curation</RefJournal>
        <RefPage>107-13</RefPage>
        <RefTotal>Vardigan M, Heus P, Thomas W. Data documentation initiative: Toward a standard for the social sciences. Int J Digit Curation. 2008;3(1):107-13. DOI: 10.2218&#47;ijdc.v3i1.45</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.2218&#47;ijdc.v3i1.45</RefLink>
      </Reference>
      <Reference refNo="4">
        <RefAuthor>Vardaki M</RefAuthor>
        <RefAuthor>Papageorgiou H</RefAuthor>
        <RefAuthor>Pentaris F</RefAuthor>
        <RefTitle>A statistical metadata model for clinical trials&#8217; data management</RefTitle>
        <RefYear>2009</RefYear>
        <RefJournal>Comput Methods Programs Biomed</RefJournal>
        <RefPage>129-45</RefPage>
        <RefTotal>Vardaki M, Papageorgiou H, Pentaris F. A statistical metadata model for clinical trials&#8217; data management. Comput Methods Programs Biomed. 2009 Aug;95(2):129-45. DOI: 10.1016&#47;j.cmpb.2009.02.004</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.1016&#47;j.cmpb.2009.02.004</RefLink>
      </Reference>
      <Reference refNo="5">
        <RefAuthor>Hughes B</RefAuthor>
        <RefTitle>Metadata Quality Evaluation: Experience from the Open Language Archives Community</RefTitle>
        <RefYear>2004</RefYear>
        <RefBookTitle>Digital Libraries: International Collaboration and Cross-Fertilization. International Conference on Asian Digital Libraries</RefBookTitle>
        <RefPage>320-9</RefPage>
        <RefTotal>Hughes B. Metadata Quality Evaluation: Experience from the Open Language Archives Community. In: Chen Z, Chen H, Miao Q, Fu Y, Fox E, Lim E, editors. Digital Libraries: International Collaboration and Cross-Fertilization. International Conference on Asian Digital Libraries. Berlin, Heidelberg: Springer; 2004. p. 320-9. DOI: 10.1007&#47;978-3-540-30544-6&#95;34</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.1007&#47;978-3-540-30544-6&#95;34</RefLink>
      </Reference>
      <Reference refNo="6">
        <RefAuthor>Huebner M</RefAuthor>
        <RefAuthor>Le Cessie S</RefAuthor>
        <RefAuthor>Schmidt CO</RefAuthor>
        <RefAuthor>Vach W</RefAuthor>
        <RefTitle>A contemporary conceptual framework for initial data analysis</RefTitle>
        <RefYear>2018</RefYear>
        <RefJournal>Obs Stud</RefJournal>
        <RefPage>171-92</RefPage>
        <RefTotal>Huebner M, Le Cessie S, Schmidt CO, Vach W. A contemporary conceptual framework for initial data analysis. Obs Stud. 2018;4:171-92.</RefTotal>
      </Reference>
      <Reference refNo="7">
        <RefAuthor>Finnie TJ</RefAuthor>
        <RefAuthor>South A</RefAuthor>
        <RefAuthor>Bento A</RefAuthor>
        <RefAuthor>Sherrard-Smith E</RefAuthor>
        <RefAuthor>Jombart T</RefAuthor>
        <RefTitle>EpiJSON: A unified data-format for epidemiology</RefTitle>
        <RefYear>2016</RefYear>
        <RefJournal>Epidemics</RefJournal>
        <RefPage>20-6</RefPage>
        <RefTotal>Finnie TJ, South A, Bento A, Sherrard-Smith E, Jombart T. EpiJSON: A unified data-format for epidemiology. Epidemics. 2016 Jun;15:20-6. DOI: 10.1016&#47;j.epidem.2015.12.002</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.1016&#47;j.epidem.2015.12.002</RefLink>
      </Reference>
      <Reference refNo="8">
        <RefAuthor>Harris PA</RefAuthor>
        <RefAuthor>Taylor R</RefAuthor>
        <RefAuthor>Thielke R</RefAuthor>
        <RefAuthor>Payne J</RefAuthor>
        <RefAuthor>Gonzalez N</RefAuthor>
        <RefAuthor>Conde JG</RefAuthor>
        <RefTitle>Research electronic data capture (REDCap) &#8211; a metadata-driven methodology and workflow process for providing translational research informatics support</RefTitle>
        <RefYear>2009</RefYear>
        <RefJournal>J Biomed Inform</RefJournal>
        <RefPage>377-81</RefPage>
        <RefTotal>Harris PA, Taylor R, Thielke R, Payne J, Gonzalez N, Conde JG. Research electronic data capture (REDCap) &#8211; a metadata-driven methodology and workflow process for providing translational research informatics support. J Biomed Inform. 2009 Apr;42(2):377-81. DOI: 10.1016&#47;j.jbi.2008.08.010</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.1016&#47;j.jbi.2008.08.010</RefLink>
      </Reference>
      <Reference refNo="9">
        <RefAuthor>Schmidt CO</RefAuthor>
        <RefAuthor>Krabbe C</RefAuthor>
        <RefAuthor>Sch&#246;ssow J</RefAuthor>
        <RefAuthor>Albers M</RefAuthor>
        <RefAuthor>Radke D</RefAuthor>
        <RefAuthor>Henke J</RefAuthor>
        <RefTitle>Square &#8211; A Web Application for Data Monitoring in Epidemiological and Clinical Studies</RefTitle>
        <RefYear>2017</RefYear>
        <RefJournal>Stud Health Technol Inform</RefJournal>
        <RefPage>549-53</RefPage>
        <RefTotal>Schmidt CO, Krabbe C, Sch&#246;ssow J, Albers M, Radke D, Henke J. Square &#8211; A Web Application for Data Monitoring in Epidemiological and Clinical Studies. Stud Health Technol Inform. 2017;235:549-53.</RefTotal>
      </Reference>
      <Reference refNo="10">
        <RefAuthor>Gaye A</RefAuthor>
        <RefAuthor>Marcon Y</RefAuthor>
        <RefAuthor>Isaeva J</RefAuthor>
        <RefAuthor>LaFlamme P</RefAuthor>
        <RefAuthor>Turner A</RefAuthor>
        <RefAuthor>Jones EM</RefAuthor>
        <RefAuthor>Minion J</RefAuthor>
        <RefAuthor>Boyd AW</RefAuthor>
        <RefAuthor>Newby CJ</RefAuthor>
        <RefAuthor>Nuotio ML</RefAuthor>
        <RefAuthor>Wilson R</RefAuthor>
        <RefAuthor>Butters O</RefAuthor>
        <RefAuthor>Murtagh B</RefAuthor>
        <RefAuthor>Demir I</RefAuthor>
        <RefAuthor>Doiron D</RefAuthor>
        <RefAuthor>Giepmans L</RefAuthor>
        <RefAuthor>Wallace SE</RefAuthor>
        <RefAuthor>Budin-Lj&#248;sne I</RefAuthor>
        <RefAuthor>Oliver Schmidt C</RefAuthor>
        <RefAuthor>Boffetta P</RefAuthor>
        <RefAuthor>Boniol M</RefAuthor>
        <RefAuthor>Bota M</RefAuthor>
        <RefAuthor>Carter KW</RefAuthor>
        <RefAuthor>deKlerk N</RefAuthor>
        <RefAuthor>Dibben C</RefAuthor>
        <RefAuthor>Francis RW</RefAuthor>
        <RefAuthor>Hiekkalinna T</RefAuthor>
        <RefAuthor>Hveem K</RefAuthor>
        <RefAuthor>Kval&#248;y K</RefAuthor>
        <RefAuthor>Millar S</RefAuthor>
        <RefAuthor>Perry IJ</RefAuthor>
        <RefAuthor>Peters A</RefAuthor>
        <RefAuthor>Phillips CM</RefAuthor>
        <RefAuthor>Popham F</RefAuthor>
        <RefAuthor>Raab G</RefAuthor>
        <RefAuthor>Reischl E</RefAuthor>
        <RefAuthor>Sheehan N</RefAuthor>
        <RefAuthor>Waldenberger M</RefAuthor>
        <RefAuthor>Perola M</RefAuthor>
        <RefAuthor>van den Heuvel E</RefAuthor>
        <RefAuthor>Macleod J</RefAuthor>
        <RefAuthor>Knoppers BM</RefAuthor>
        <RefAuthor>Stolk RP</RefAuthor>
        <RefAuthor>Fortier I</RefAuthor>
        <RefAuthor>Harris JR</RefAuthor>
        <RefAuthor>Woffenbuttel BH</RefAuthor>
        <RefAuthor>Murtagh MJ</RefAuthor>
        <RefAuthor>Ferretti V</RefAuthor>
        <RefAuthor>Burton PR</RefAuthor>
        <RefTitle>DataSHIELD: taking the analysis to the data, not the data to the analysis</RefTitle>
        <RefYear>2014</RefYear>
        <RefJournal>Int J Epidemiol</RefJournal>
        <RefPage>1929-44</RefPage>
        <RefTotal>Gaye A, Marcon Y, Isaeva J, LaFlamme P, Turner A, Jones EM, Minion J, Boyd AW, Newby CJ, Nuotio ML, Wilson R, Butters O, Murtagh B, Demir I, Doiron D, Giepmans L, Wallace SE, Budin-Lj&#248;sne I, Oliver Schmidt C, Boffetta P, Boniol M, Bota M, Carter KW, deKlerk N, Dibben C, Francis RW, Hiekkalinna T, Hveem K, Kval&#248;y K, Millar S, Perry IJ, Peters A, Phillips CM, Popham F, Raab G, Reischl E, Sheehan N, Waldenberger M, Perola M, van den Heuvel E, Macleod J, Knoppers BM, Stolk RP, Fortier I, Harris JR, Woffenbuttel BH, Murtagh MJ, Ferretti V, Burton PR. DataSHIELD: taking the analysis to the data, not the data to the analysis. Int J Epidemiol. 2014 Dec;43(6):1929-44. DOI: 10.1093&#47;ije&#47;dyu188</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.1093&#47;ije&#47;dyu188</RefLink>
      </Reference>
      <Reference refNo="11">
        <RefAuthor>Nonnemacher M</RefAuthor>
        <RefAuthor>Nasseh D</RefAuthor>
        <RefAuthor>Stausberg J</RefAuthor>
        <RefTitle></RefTitle>
        <RefYear>2014</RefYear>
        <RefBookTitle>Datenqualit&#228;t in der medizinischen Forschung</RefBookTitle>
        <RefPage></RefPage>
        <RefTotal>Nonnemacher M, Nasseh D, Stausberg J. Datenqualit&#228;t in der medizinischen Forschung. Berlin: Medizinisch Wissenschaftliche Verlagsgesellschaft; 2014. (TMF &#8211; Technologie- und Methodenplattform).</RefTotal>
      </Reference>
      <Reference refNo="12">
        <RefAuthor>Chen H</RefAuthor>
        <RefAuthor>Hailey D</RefAuthor>
        <RefAuthor>Wang N</RefAuthor>
        <RefAuthor>Yu P</RefAuthor>
        <RefTitle>A review of data quality assessment methods for public health information systems</RefTitle>
        <RefYear>2014</RefYear>
        <RefJournal>Int J Environ Res Public Health</RefJournal>
        <RefPage>5170-207</RefPage>
        <RefTotal>Chen H, Hailey D, Wang N, Yu P. A review of data quality assessment methods for public health information systems. Int J Environ Res Public Health. 2014 May;11(5):5170-207. DOI: 10.3390&#47;ijerph110505170</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.3390&#47;ijerph110505170</RefLink>
      </Reference>
      <Reference refNo="13">
        <RefAuthor>Edwards PN</RefAuthor>
        <RefAuthor>Mayernik MS</RefAuthor>
        <RefAuthor>Batcheller AL</RefAuthor>
        <RefAuthor>Bowker GC</RefAuthor>
        <RefAuthor>Borgman CL</RefAuthor>
        <RefTitle>Science friction: data, metadata, and collaboration</RefTitle>
        <RefYear>2011</RefYear>
        <RefJournal>Soc Stud Sci</RefJournal>
        <RefPage>667-90</RefPage>
        <RefTotal>Edwards PN, Mayernik MS, Batcheller AL, Bowker GC, Borgman CL. Science friction: data, metadata, and collaboration. Soc Stud Sci. 2011 Oct;41(5):667-90. DOI: 10.1177&#47;0306312711413314</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.1177&#47;0306312711413314</RefLink>
      </Reference>
      <Reference refNo="14">
        <RefAuthor>Karr AF</RefAuthor>
        <RefAuthor>Sanil AP</RefAuthor>
        <RefAuthor>Banks DL</RefAuthor>
        <RefTitle>Data quality: A statistical perspective</RefTitle>
        <RefYear>2006</RefYear>
        <RefJournal>Stat Methodol</RefJournal>
        <RefPage>137-73</RefPage>
        <RefTotal>Karr AF, Sanil AP, Banks DL. Data quality: A statistical perspective. Stat Methodol. 2006;3(2):137-73. DOI: 10.1016&#47;j.stamet.2005.08.005</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.1016&#47;j.stamet.2005.08.005</RefLink>
      </Reference>
      <Reference refNo="15">
        <RefAuthor>Nadkarni PM</RefAuthor>
        <RefTitle></RefTitle>
        <RefYear>2011</RefYear>
        <RefBookTitle>Metadata-driven software systems in biomedicine: designing systems that can adapt to changing knowledge</RefBookTitle>
        <RefPage></RefPage>
        <RefTotal>Nadkarni PM. Metadata-driven software systems in biomedicine: designing systems that can adapt to changing knowledge. London: Springer; 2011. (Health Informatics). DOI: 10.1007&#47;978-0-85729-510-1</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.1007&#47;978-0-85729-510-1</RefLink>
      </Reference>
      <Reference refNo="16">
        <RefAuthor>Montgomery DC</RefAuthor>
        <RefTitle></RefTitle>
        <RefYear>2017</RefYear>
        <RefBookTitle>Design and analysis of experiments</RefBookTitle>
        <RefPage></RefPage>
        <RefTotal>Montgomery DC. Design and analysis of experiments. New Jersey: Wiley; 2017.</RefTotal>
      </Reference>
      <Reference refNo="17">
        <RefAuthor>V&#246;lzke H</RefAuthor>
        <RefAuthor>Alte D</RefAuthor>
        <RefAuthor>Schmidt CO</RefAuthor>
        <RefAuthor>Radke D</RefAuthor>
        <RefAuthor>Lorbeer R</RefAuthor>
        <RefAuthor>Friedrich N</RefAuthor>
        <RefAuthor>Aumann N</RefAuthor>
        <RefAuthor>Lau K</RefAuthor>
        <RefAuthor>Piontek M</RefAuthor>
        <RefAuthor>Born G</RefAuthor>
        <RefAuthor>Havemann C</RefAuthor>
        <RefAuthor>Ittermann T</RefAuthor>
        <RefAuthor>Schipf S</RefAuthor>
        <RefAuthor>Haring R</RefAuthor>
        <RefAuthor>Baumeister SE</RefAuthor>
        <RefAuthor>Wallaschofski H</RefAuthor>
        <RefAuthor>Nauck M</RefAuthor>
        <RefAuthor>Frick S</RefAuthor>
        <RefAuthor>Arnold A</RefAuthor>
        <RefAuthor>J&#252;nger M</RefAuthor>
        <RefAuthor>Mayerle J</RefAuthor>
        <RefAuthor>Kraft M</RefAuthor>
        <RefAuthor>Lerch MM</RefAuthor>
        <RefAuthor>D&#246;rr M</RefAuthor>
        <RefAuthor>Reffelmann T</RefAuthor>
        <RefAuthor>Empen K</RefAuthor>
        <RefAuthor>Felix SB</RefAuthor>
        <RefAuthor>Obst A</RefAuthor>
        <RefAuthor>Koch B</RefAuthor>
        <RefAuthor>Gl&#228;ser S</RefAuthor>
        <RefAuthor>Ewert R</RefAuthor>
        <RefAuthor>Fietze I</RefAuthor>
        <RefAuthor>Penzel T</RefAuthor>
        <RefAuthor>D&#246;ren M</RefAuthor>
        <RefAuthor>Rathmann W</RefAuthor>
        <RefAuthor>Haerting J</RefAuthor>
        <RefAuthor>Hannemann M</RefAuthor>
        <RefAuthor>R&#246;pcke J</RefAuthor>
        <RefAuthor>Schminke U</RefAuthor>
        <RefAuthor>J&#252;rgens C</RefAuthor>
        <RefAuthor>Tost F</RefAuthor>
        <RefAuthor>Rettig R</RefAuthor>
        <RefAuthor>Kors JA</RefAuthor>
        <RefAuthor>Ungerer S</RefAuthor>
        <RefAuthor>Hegenscheid K</RefAuthor>
        <RefAuthor>K&#252;hn JP</RefAuthor>
        <RefAuthor>K&#252;hn J</RefAuthor>
        <RefAuthor>Hosten N</RefAuthor>
        <RefAuthor>Puls R</RefAuthor>
        <RefAuthor>Henke J</RefAuthor>
        <RefAuthor>Gloger O</RefAuthor>
        <RefAuthor>Teumer A</RefAuthor>
        <RefAuthor>Homuth G</RefAuthor>
        <RefAuthor>V&#246;lker U</RefAuthor>
        <RefAuthor>Schwahn C</RefAuthor>
        <RefAuthor>Holtfreter B</RefAuthor>
        <RefAuthor>Polzer I</RefAuthor>
        <RefAuthor>Kohlmann T</RefAuthor>
        <RefAuthor>Grabe HJ</RefAuthor>
        <RefAuthor>Rosskopf D</RefAuthor>
        <RefAuthor>Kroemer HK</RefAuthor>
        <RefAuthor>Kocher T</RefAuthor>
        <RefAuthor>Biffar R</RefAuthor>
        <RefAuthor>John U</RefAuthor>
        <RefAuthor>Hoffmann W</RefAuthor>
        <RefTitle>Cohort profile: the study of health in Pomerania</RefTitle>
        <RefYear>2011</RefYear>
        <RefJournal>Int J Epidemiol</RefJournal>
        <RefPage>294-307</RefPage>
        <RefTotal>V&#246;lzke H, Alte D, Schmidt CO, Radke D, Lorbeer R, Friedrich N, Aumann N, Lau K, Piontek M, Born G, Havemann C, Ittermann T, Schipf S, Haring R, Baumeister SE, Wallaschofski H, Nauck M, Frick S, Arnold A, J&#252;nger M, Mayerle J, Kraft M, Lerch MM, D&#246;rr M, Reffelmann T, Empen K, Felix SB, Obst A, Koch B, Gl&#228;ser S, Ewert R, Fietze I, Penzel T, D&#246;ren M, Rathmann W, Haerting J, Hannemann M, R&#246;pcke J, Schminke U, J&#252;rgens C, Tost F, Rettig R, Kors JA, Ungerer S, Hegenscheid K, K&#252;hn JP, K&#252;hn J, Hosten N, Puls R, Henke J, Gloger O, Teumer A, Homuth G, V&#246;lker U, Schwahn C, Holtfreter B, Polzer I, Kohlmann T, Grabe HJ, Rosskopf D, Kroemer HK, Kocher T, Biffar R, John U, Hoffmann W. Cohort profile: the study of health in Pomerania. Int J Epidemiol. 2011 Apr;40(2):294-307. DOI: 10.1093&#47;ije&#47;dyp394</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.1093&#47;ije&#47;dyp394</RefLink>
      </Reference>
      <Reference refNo="18">
        <RefAuthor>Richter A</RefAuthor>
        <RefAuthor>Schauer B</RefAuthor>
        <RefAuthor>Henselin K</RefAuthor>
        <RefAuthor>Junge M</RefAuthor>
        <RefAuthor>Struckmann S</RefAuthor>
        <RefAuthor>Sierocinsky E</RefAuthor>
        <RefAuthor>Henke J</RefAuthor>
        <RefAuthor>Schmidt CO</RefAuthor>
        <RefTitle>Which data and data structures are required to implement automated data quality monitoring in observational studies&#63; Experiences from a population based cohort study</RefTitle>
        <RefYear>2018</RefYear>
        <RefBookTitle>63. Jahrestagung der Deutschen Gesellschaft f&#252;r Medizinische Informatik, Biometrie und Epidemiologie e.V. (GMDS). Osnabr&#252;ck, 02.-06.09.2018.</RefBookTitle>
        <RefPage></RefPage>
        <RefTotal>Richter A, Schauer B, Henselin K, Junge M, Struckmann S, Sierocinsky E, Henke J, Schmidt CO. Which data and data structures are required to implement automated data quality monitoring in observational studies&#63; Experiences from a population based cohort study. In: Deutsche Gesellschaft f&#252;r Medizinische Informatik, Biometrie und Epidemiologie, editor. 63. Jahrestagung der Deutschen Gesellschaft f&#252;r Medizinische Informatik, Biometrie und Epidemiologie e.V. (GMDS). Osnabr&#252;ck, 02.-06.09.2018. D&#252;sseldorf: German Medical Science GMS Publishing House; 2018. DocAbstr. 252. DOI: 10.3205&#47;18gmds016</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.3205&#47;18gmds016</RefLink>
      </Reference>
      <Reference refNo="19">
        <RefAuthor>Werner A</RefAuthor>
        <RefAuthor>Maiwald S</RefAuthor>
        <RefAuthor>Henselin K</RefAuthor>
        <RefAuthor>Westphal S</RefAuthor>
        <RefAuthor>Henke J</RefAuthor>
        <RefAuthor>Alte D</RefAuthor>
        <RefAuthor></RefAuthor>
        <RefTitle>Modular automatisierte Datenbereinigung in einer gro&#223;en Bev&#246;lkerungsstudie</RefTitle>
        <RefYear>2016</RefYear>
        <RefBookTitle>Proceedings der 20. Konferenz der SAS&#174;-Anwender in Forschung und Entwicklung (KSFE)</RefBookTitle>
        <RefPage>279-84</RefPage>
        <RefTotal>Werner A, Maiwald S, Henselin K, Westphal S, Henke J, Alte D, et al. Modular automatisierte Datenbereinigung in einer gro&#223;en Bev&#246;lkerungsstudie &#91;Modular automated data cleaning in a large population-based cohort&#93;. In: Chenot JF, Minkenberg R, editors. Proceedings der 20. Konferenz der SAS&#174;-Anwender in Forschung und Entwicklung (KSFE). Aachen: Shaker; 2016. p. 279-84.</RefTotal>
      </Reference>
      <Reference refNo="20">
        <RefAuthor>Schmidt CO</RefAuthor>
        <RefAuthor>Albers M</RefAuthor>
        <RefAuthor>Henke J</RefAuthor>
        <RefAuthor>Schipf S</RefAuthor>
        <RefAuthor>Baumeister SE</RefAuthor>
        <RefAuthor>Werner A</RefAuthor>
        <RefAuthor></RefAuthor>
        <RefTitle></RefTitle>
        <RefYear></RefYear>
        <RefBookTitle>Quality monitoring in a complex epidemiologic study: Some lessons to be learned. DGEpi Jahrestagung; 2013 Sep 24-27; Leipzig</RefBookTitle>
        <RefPage></RefPage>
        <RefTotal>Schmidt CO, Albers M, Henke J, Schipf S, Baumeister SE, Werner A, et al, editors. Quality monitoring in a complex epidemiologic study: Some lessons to be learned. DGEpi Jahrestagung; 2013 Sep 24-27; Leipzig.</RefTotal>
      </Reference>
      <Reference refNo="21">
        <RefAuthor>Wang RY</RefAuthor>
        <RefAuthor>Strong DM</RefAuthor>
        <RefTitle>Beyond accuracy: What data quality means to data consumers</RefTitle>
        <RefYear>1996</RefYear>
        <RefJournal>J Manag Inf Syst</RefJournal>
        <RefPage>5-33</RefPage>
        <RefTotal>Wang RY, Strong DM. Beyond accuracy: What data quality means to data consumers. J Manag Inf Syst. 1996;12(4):5-33. DOI: 10.1080&#47;07421222.1996.11518099</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.1080&#47;07421222.1996.11518099</RefLink>
      </Reference>
      <Reference refNo="22">
        <RefAuthor>Watts S</RefAuthor>
        <RefAuthor>Shankaranarayanan G</RefAuthor>
        <RefAuthor>Even A</RefAuthor>
        <RefTitle>Data quality assessment in context: A cognitive perspective</RefTitle>
        <RefYear>2009</RefYear>
        <RefJournal>Decis Support Syst</RefJournal>
        <RefPage>202-11</RefPage>
        <RefTotal>Watts S, Shankaranarayanan G, Even A. Data quality assessment in context: A cognitive perspective. Decis Support Syst. 2009;48(1):202-11. DOI: 10.1016&#47;j.dss.2009.07.012</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.1016&#47;j.dss.2009.07.012</RefLink>
      </Reference>
      <Reference refNo="23">
        <RefAuthor>Brown JS</RefAuthor>
        <RefAuthor>Kahn M</RefAuthor>
        <RefAuthor>Toh S</RefAuthor>
        <RefTitle>Data quality assessment for comparative effectiveness research in distributed data networks</RefTitle>
        <RefYear>2013</RefYear>
        <RefJournal>Med Care</RefJournal>
        <RefPage>S22-9</RefPage>
        <RefTotal>Brown JS, Kahn M, Toh S. Data quality assessment for comparative effectiveness research in distributed data networks. Med Care. 2013 Aug;51(8 Suppl 3):S22-9. DOI: 10.1097&#47;MLR.0b013e31829b1e2c</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.1097&#47;MLR.0b013e31829b1e2c</RefLink>
      </Reference>
      <Reference refNo="24">
        <RefAuthor>Dugas M</RefAuthor>
        <RefAuthor>Meidt A</RefAuthor>
        <RefAuthor>Neuhaus P</RefAuthor>
        <RefAuthor>Storck M</RefAuthor>
        <RefAuthor>Varghese J</RefAuthor>
        <RefTitle>ODMedit: uniform semantic annotation for data integration in medicine based on a public metadata repository</RefTitle>
        <RefYear>2016</RefYear>
        <RefJournal>BMC Med Res Methodol</RefJournal>
        <RefPage>65</RefPage>
        <RefTotal>Dugas M, Meidt A, Neuhaus P, Storck M, Varghese J. ODMedit: uniform semantic annotation for data integration in medicine based on a public metadata repository. BMC Med Res Methodol. 2016 Jun;16:65. DOI: 10.1186&#47;s12874-016-0164-9</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.1186&#47;s12874-016-0164-9</RefLink>
      </Reference>
      <Reference refNo="25">
        <RefAuthor>Cai L</RefAuthor>
        <RefAuthor>Zhu Y</RefAuthor>
        <RefTitle>The challenges of data quality and data quality assessment in the big data era</RefTitle>
        <RefYear>2015</RefYear>
        <RefJournal>Data Sci J</RefJournal>
        <RefPage>2</RefPage>
        <RefTotal>Cai L, Zhu Y. The challenges of data quality and data quality assessment in the big data era. Data Sci J. 2015;14:2. DOI: 10.5334&#47;dsj-2015-002</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.5334&#47;dsj-2015-002</RefLink>
      </Reference>
      <Reference refNo="26">
        <RefAuthor>Dugas M</RefAuthor>
        <RefAuthor>Neuhaus P</RefAuthor>
        <RefAuthor>Meidt A</RefAuthor>
        <RefAuthor>Doods J</RefAuthor>
        <RefAuthor>Storck M</RefAuthor>
        <RefAuthor>Bruland P</RefAuthor>
        <RefAuthor>Varghese J</RefAuthor>
        <RefTitle>Portal of medical data models: information infrastructure for medical research and healthcare</RefTitle>
        <RefYear>2016</RefYear>
        <RefJournal>Database (Oxford)</RefJournal>
        <RefPage>bav121</RefPage>
        <RefTotal>Dugas M, Neuhaus P, Meidt A, Doods J, Storck M, Bruland P, Varghese J. Portal of medical data models: information infrastructure for medical research and healthcare. Database (Oxford). 2016;2016:bav121. DOI: 10.1093&#47;database&#47;bav121</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.1093&#47;database&#47;bav121</RefLink>
      </Reference>
      <Reference refNo="27">
        <RefAuthor>Lu Z</RefAuthor>
        <RefAuthor>Su J</RefAuthor>
        <RefTitle>Clinical data management: Current status, challenges, and future directions from industry perspectives</RefTitle>
        <RefYear>2010</RefYear>
        <RefJournal>Open Access J Clin Trials</RefJournal>
        <RefPage>93-105</RefPage>
        <RefTotal>Lu Z, Su J. Clinical data management: Current status, challenges, and future directions from industry perspectives. Open Access J Clin Trials. 2010;2:93-105. DOI: 10.2147&#47;OAJCT.S8172</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.2147&#47;OAJCT.S8172</RefLink>
      </Reference>
      <Reference refNo="28">
        <RefAuthor>Assenov Y</RefAuthor>
        <RefAuthor>M&#252;ller F</RefAuthor>
        <RefAuthor>Lutsik P</RefAuthor>
        <RefAuthor>Walter J</RefAuthor>
        <RefAuthor>Lengauer T</RefAuthor>
        <RefAuthor>Bock C</RefAuthor>
        <RefTitle>Comprehensive analysis of DNA methylation data with RnBeads</RefTitle>
        <RefYear>2014</RefYear>
        <RefJournal>Nat Methods</RefJournal>
        <RefPage>1138-40</RefPage>
        <RefTotal>Assenov Y, M&#252;ller F, Lutsik P, Walter J, Lengauer T, Bock C. Comprehensive analysis of DNA methylation data with RnBeads. Nat Methods. 2014 Nov;11(11):1138-40. DOI: 10.1038&#47;nmeth.3115</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.1038&#47;nmeth.3115</RefLink>
      </Reference>
      <Reference refNo="29">
        <RefAuthor>Prinz F</RefAuthor>
        <RefAuthor>Schlange T</RefAuthor>
        <RefAuthor>Asadullah K</RefAuthor>
        <RefTitle>Believe it or not: how much can we rely on published data on potential drug targets&#63;</RefTitle>
        <RefYear>2011</RefYear>
        <RefJournal>Nat Rev Drug Discov</RefJournal>
        <RefPage>712</RefPage>
        <RefTotal>Prinz F, Schlange T, Asadullah K. Believe it or not: how much can we rely on published data on potential drug targets&#63; Nat Rev Drug Discov. 2011 Aug;10(9):712. DOI: 10.1038&#47;nrd3439-c1</RefTotal>
        <RefLink>https:&#47;&#47;doi.org&#47;10.1038&#47;nrd3439-c1</RefLink>
      </Reference>
    </References>
    <Media>
      <Tables>
        <Table format="png">
          <MediaNo>1</MediaNo>
          <MediaID>1</MediaID>
          <Caption><Pgraph><Mark1>Table 1: Examples of metadata used for data quality monitoring</Mark1></Pgraph></Caption>
        </Table>
        <Table format="png">
          <MediaNo>2</MediaNo>
          <MediaID>2</MediaID>
          <Caption><Pgraph><Mark1>Table 2: Examples of process variables used for data quality assessments</Mark1></Pgraph></Caption>
        </Table>
        <NoOfTables>2</NoOfTables>
      </Tables>
      <Figures>
        <Figure format="png" height="484" width="1126">
          <MediaNo>1</MediaNo>
          <MediaID>1</MediaID>
          <Caption><Pgraph><Mark1>Figure 1: Left panel: study data usually comprise identifiers, measurements (e.g. C-reactive protein (CRP)), and process variables (e.g. examiner ID). In some cases metadata variables have to be added, if applicable metadata vary across observations. </Mark1><LineBreak></LineBreak><Mark1>Top right panel: selection of static metadata with 1:1 relation to columns of study data. </Mark1><LineBreak></LineBreak><Mark1>Bottom right panel: relations between study data, metadata, and links&#42; between study data. </Mark1><LineBreak></LineBreak>&#42; Relations between two or more related study data variables should be defined in the metadata attributes, e.g. CRP-laboratory results and the examiner who drew the blood sample. </Pgraph></Caption>
        </Figure>
        <NoOfPictures>1</NoOfPictures>
      </Figures>
      <InlineFigures>
        <NoOfPictures>0</NoOfPictures>
      </InlineFigures>
      <Attachments>
        <NoOfAttachments>0</NoOfAttachments>
      </Attachments>
    </Media>
  </OrigData>
</GmsArticle>