Useful PHP Code: validation

Validating email addresses is important step in maintaining quality user data on any online application. Here is a simple PHP email validation function. Enjoy and provide any improvements via comments.

function isEmail($email)
{
if (preg_match("/^(\w+((-\w+)|(\w.\w+))*)\@(\w+((\.|-)\w+)*\.\w+$)/",$email))
{
return true;
}
else 
{
return false;
}
}

For something (a lot) more robust, a great piece of code by Cal Henderson is given below - it is a large piece of code, but it will ensure that you are validating 99% of all the possible email addresses correctly.

<?php

 #
 # RFC3696 Email Parser
 #
 # By Cal Henderson <cal@iamcal.com>
 #
 # This code is dual licensed:
 # CC Attribution-ShareAlike 2.5 - http://creativecommons.org/licenses/by-sa/2.5/
 # GPLv3 - http://www.gnu.org/copyleft/gpl.html
 #
 # $Revision: 5039 $
 #

 ##################################################################################

 function is_rfc3696_valid_email_address($email){


  ####################################################################################
  #
  # NO-WS-CTL       =       %d1-8 /         ; US-ASCII control characters
  #                         %d11 /          ;  that do not include the
  #                         %d12 /          ;  carriage return, line feed,
  #                         %d14-31 /       ;  and white space characters
  #                         %d127
  # ALPHA          =  %x41-5A / %x61-7A   ; A-Z / a-z
  # DIGIT          =  %x30-39

  $no_ws_ctl = "[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x7f]";
  $alpha  = "[\\x41-\\x5a\\x61-\\x7a]";
  $digit  = "[\\x30-\\x39]";
  $cr  = "\\x0d";
  $lf  = "\\x0a";
  $crlf  = "(?:$cr$lf)";


  ####################################################################################
  #
  # obs-char        =       %d0-9 / %d11 /          ; %d0-127 except CR and
  #                         %d12 / %d14-127         ;  LF
  # obs-text        =       *LF *CR *(obs-char *LF *CR)
  # text            =       %d1-9 /         ; Characters excluding CR and LF
  #                         %d11 /
  #                         %d12 /
  #                         %d14-127 /
  #                         obs-text
  # obs-qp          =       "\" (%d0-127)
  # quoted-pair     =       ("\" text) / obs-qp

  $obs_char = "[\\x00-\\x09\\x0b\\x0c\\x0e-\\x7f]";
  $obs_text = "(?:$lf*$cr*(?:$obs_char$lf*$cr*)*)";
  $text  = "(?:[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f]|$obs_text)";

  #
  # there's an issue with the definition of 'text', since 'obs_text' can
  # be blank and that allows qp's with no character after the slash. we're
  # treating that as bad, so this just checks we have at least one
  # (non-CRLF) character
  #

  $text  = "(?:$lf*$cr*$obs_char$lf*$cr*)";
  $obs_qp  = "(?:\\x5c[\\x00-\\x7f])";
  $quoted_pair = "(?:\\x5c$text|$obs_qp)";


  ####################################################################################
  #
  # obs-FWS         =       1*WSP *(CRLF 1*WSP)
  # FWS             =       ([*WSP CRLF] 1*WSP) /   ; Folding white space
  #                         obs-FWS
  # ctext           =       NO-WS-CTL /     ; Non white space controls
  #                         %d33-39 /       ; The rest of the US-ASCII
  #                         %d42-91 /       ;  characters not including "(",
  #                         %d93-126        ;  ")", or "\"
  # ccontent        =       ctext / quoted-pair / comment
  # comment         =       "(" *([FWS] ccontent) [FWS] ")"
  # CFWS            =       *([FWS] comment) (([FWS] comment) / FWS)

  #
  # note: we translate ccontent only partially to avoid an infinite loop
  # instead, we'll recursively strip *nested* comments before processing
  # the input. that will leave 'plain old comments' to be matched during
  # the main parse.
  #

  $wsp  = "[\\x20\\x09]";
  $obs_fws = "(?:$wsp+(?:$crlf$wsp+)*)";
  $fws  = "(?:(?:(?:$wsp*$crlf)?$wsp+)|$obs_fws)";
  $ctext  = "(?:$no_ws_ctl|[\\x21-\\x27\\x2A-\\x5b\\x5d-\\x7e])";
  $ccontent = "(?:$ctext|$quoted_pair)";
  $comment = "(?:\\x28(?:$fws?$ccontent)*$fws?\\x29)";
  $cfws  = "(?:(?:$fws?$comment)*(?:$fws?$comment|$fws))";


  #
  # these are the rules for removing *nested* comments. we'll just detect
  # outer comment and replace it with an empty comment, and recurse until
  # we stop.
  #

  $outer_ccontent_dull = "(?:$fws?$ctext|$quoted_pair)";
  $outer_ccontent_nest = "(?:$fws?$comment)";
  $outer_comment  = "(?:\\x28$outer_ccontent_dull*(?:$outer_ccontent_nest$outer_ccontent_dull*)+$fws?\\x29)";


  ####################################################################################
  #
  # atext           =       ALPHA / DIGIT / ; Any character except controls,
  #                         "!" / "#" /     ;  SP, and specials.
  #                         "$" / "%" /     ;  Used for atoms
  #                         "&" / "'" /
  #                         "*" / "+" /
  #                         "-" / "/" /
  #                         "=" / "?" /
  #                         "^" / "_" /
  #                         "`" / "{" /
  #                         "|" / "}" /
  #                         "~"
  # atom            =       [CFWS] 1*atext [CFWS]

  $atext  = "(?:$alpha|$digit|[\\x21\\x23-\\x27\\x2a\\x2b\\x2d\\x2f\\x3d\\x3f\\x5e\\x5f\\x60\\x7b-\\x7e])";
  $atom  = "(?:$cfws?(?:$atext)+$cfws?)";


  ####################################################################################
  #
  # qtext           =       NO-WS-CTL /     ; Non white space controls
  #                         %d33 /          ; The rest of the US-ASCII
  #                         %d35-91 /       ;  characters not including "\"
  #                         %d93-126        ;  or the quote character
  # qcontent        =       qtext / quoted-pair
  # quoted-string   =       [CFWS]
  #                         DQUOTE *([FWS] qcontent) [FWS] DQUOTE
  #                         [CFWS]
  # word            =       atom / quoted-string

  $qtext  = "(?:$no_ws_ctl|[\\x21\\x23-\\x5b\\x5d-\\x7e])";
  $qcontent = "(?:$qtext|$quoted_pair)";
  $quoted_string = "(?:$cfws?\\x22(?:$fws?$qcontent)*$fws?\\x22$cfws?)";

  #
  # changed the '*' to a '+' to require that quoted strings are not empty
  #

  $quoted_string = "(?:$cfws?\\x22(?:$fws?$qcontent)+$fws?\\x22$cfws?)";
  $word  = "(?:$atom|$quoted_string)";


  ####################################################################################
  #
  # obs-local-part  =       word *("." word)
  # obs-domain      =       atom *("." atom)

  $obs_local_part = "(?:$word(?:\\x2e$word)*)";
  $obs_domain = "(?:$atom(?:\\x2e$atom)*)";


  ####################################################################################
  #
  # dot-atom-text   =       1*atext *("." 1*atext)
  # dot-atom        =       [CFWS] dot-atom-text [CFWS]

  $dot_atom_text = "(?:$atext+(?:\\x2e$atext+)*)";
  $dot_atom = "(?:$cfws?$dot_atom_text$cfws?)";


  ####################################################################################
  #
  # domain-literal  =       [CFWS] "[" *([FWS] dcontent) [FWS] "]" [CFWS]
  # dcontent        =       dtext / quoted-pair
  # dtext           =       NO-WS-CTL /     ; Non white space controls
  # 
  #                         %d33-90 /       ; The rest of the US-ASCII
  #                         %d94-126        ;  characters not including "[",
  #                                         ;  "]", or "\"

  $dtext  = "(?:$no_ws_ctl|[\\x21-\\x5a\\x5e-\\x7e])";
  $dcontent = "(?:$dtext|$quoted_pair)";
  $domain_literal = "(?:$cfws?\\x5b(?:$fws?$dcontent)*$fws?\\x5d$cfws?)";


  ####################################################################################
  #
  # local-part      =       dot-atom / quoted-string / obs-local-part
  # domain          =       dot-atom / domain-literal / obs-domain
  # addr-spec       =       local-part "@" domain

  $local_part = "(($dot_atom)|($quoted_string)|($obs_local_part))";
  $domain  = "(($dot_atom)|($domain_literal)|($obs_domain))";
  $addr_spec = "$local_part\\x40$domain";



  #
  # see http://www.dominicsayers.com/isemail/ for details, but this should probably be 254
  #

  if (strlen($email) > 256) return 0;


  #
  # we need to strip nested comments first - we replace them with a simple comment
  #

  $email = rfc3696_strip_comments($outer_comment, $email, "(x)");


  #
  # now match what's left
  #

  if (!preg_match("!^$addr_spec$!", $email, $m)){

   return 0;
  }

  $bits = array(
   'local'   => isset($m[1]) ? $m[1] : '',
   'local-atom'  => isset($m[2]) ? $m[2] : '',
   'local-quoted'  => isset($m[3]) ? $m[3] : '',
   'local-obs'  => isset($m[4]) ? $m[4] : '',
   'domain'  => isset($m[5]) ? $m[5] : '',
   'domain-atom'  => isset($m[6]) ? $m[6] : '',
   'domain-literal' => isset($m[7]) ? $m[7] : '',
   'domain-obs'  => isset($m[8]) ? $m[8] : '',
  );


  #
  # we need to now strip comments from $bits[local] and $bits[domain],
  # since we know they're i the right place and we want them out of the
  # way for checking IPs, label sizes, etc
  #

  $bits['local'] = rfc3696_strip_comments($comment, $bits['local']);
  $bits['domain'] = rfc3696_strip_comments($comment, $bits['domain']);


  #
  # length limits on segments
  #

  if (strlen($bits['local']) > 64) return 0;
  if (strlen($bits['domain']) > 255) return 0;


  #
  # restrictuions on domain-literals from RFC2821 section 4.1.3
  #

  if (strlen($bits['domain-literal'])){

   $Snum   = "(\d{1,3})";
   $IPv4_address_literal = "$Snum\.$Snum\.$Snum\.$Snum";

   $IPv6_hex  = "(?:[0-9a-fA-F]{1,4})";

   $IPv6_full  = "IPv6\:$IPv6_hex(:?\:$IPv6_hex){7}";

   $IPv6_comp_part  = "(?:$IPv6_hex(?:\:$IPv6_hex){0,5})?";
   $IPv6_comp  = "IPv6\:($IPv6_comp_part\:\:$IPv6_comp_part)";

   $IPv6v4_full  = "IPv6\:$IPv6_hex(?:\:$IPv6_hex){5}\:$IPv4_address_literal";

   $IPv6v4_comp_part = "$IPv6_hex(?:\:$IPv6_hex){0,3}";
   $IPv6v4_comp  = "IPv6\:((?:$IPv6v4_comp_part)?\:\:(?:$IPv6v4_comp_part\:)?)$IPv4_address_literal";


   #
   # IPv4 is simple
   #

   if (preg_match("!^\[$IPv4_address_literal\]$!", $bits['domain'], $m)){

    if (intval($m[1]) > 255) return 0;
    if (intval($m[2]) > 255) return 0;
    if (intval($m[3]) > 255) return 0;
    if (intval($m[4]) > 255) return 0;

   }else{

    #
    # this should be IPv6 - a bunch of tests are needed here :)
    #

    while (1){

     if (preg_match("!^\[$IPv6_full\]$!", $bits['domain'])){
      break;
     }

     if (preg_match("!^\[$IPv6_comp\]$!", $bits['domain'], $m)){
      list($a, $b) = explode('::', $m[1]);
      $folded = (strlen($a) && strlen($b)) ? "$a:$b" : "$a$b";
      $groups = explode(':', $folded);
      if (count($groups) > 6) return 0;
      break;
     }

     if (preg_match("!^\[$IPv6v4_full\]$!", $bits['domain'], $m)){

      if (intval($m[1]) > 255) return 0;
      if (intval($m[2]) > 255) return 0;
      if (intval($m[3]) > 255) return 0;
      if (intval($m[4]) > 255) return 0;
      break;
     }

     if (preg_match("!^\[$IPv6v4_comp\]$!", $bits['domain'], $m)){
      list($a, $b) = explode('::', $m[1]);
      $b = substr($b, 0, -1); # remove the trailing colon before the IPv4 address
      $folded = (strlen($a) && strlen($b)) ? "$a:$b" : "$a$b";
      $groups = explode(':', $folded);
      if (count($groups) > 4) return 0;
      break;
     }

     return 0;
    }
   }   
  }else{

   #
   # the domain is either dot-atom or obs-domain - either way, it's
   # made up of simple labels and we split on dots
   #

   $labels = explode('.', $bits['domain']);


   #
   # this is allowed by both dot-atom and obs-domain, but is un-routeable on the
   # public internet, so we'll fail it (e.g. user@localhost)
   #

   if (count($labels) == 1) return 0;


   #
   # checks on each label
   #

   foreach ($labels as $label){

    if (strlen($label) > 63) return 0;
    if (substr($label, 0, 1) == '-') return 0;
    if (substr($label, -1) == '-') return 0;
   }


   #
   # last label can't be all numeric
   #

   if (preg_match('!^[0-9]+$!', array_pop($labels))) return 0;
  }


  return 1;
 }

 ##################################################################################

 function rfc3696_strip_comments($comment, $email, $replace=''){

  while (1){
   $new = preg_replace("!$comment!", $replace, $email);
   if (strlen($new) == strlen($email)){
    return $email;
   }
   $email = $new;
  }
 }

 ##################################################################################
?>

Very nice code by Mr. Henderson, I think you'll agree.

Useful PHP Code

PHP UK Postcode Validation

Robust PHP Email Validator

Blog Archive

Search This Blog