*
  • cDATA symbol substitution: substituting one type of symbol for another, e.g. 'a' for 'á'. The new content
  •   is placed within a new tag whose name is the field name followed by '_s', e.g. a field containing 'á'
  •   is followed by a second '_s' field containing 'a'.
  • *
  • Field removal: specified xml tags and their associated content can be excluded from the modified xml document.
  • *
  • alphabetization tag: A new tag is created containing a value which will be used to sort query results.
  •   The value will be determined by a set of conditions supplied by the user.
  •
 * The parsing ability of this class actually provides two separate but related parses (field search and
 * modification).
 *
 * I chose to use the same class to provide both functions because it seemed a waste to include an entire new
 * class to provide the functionality of creating a list of fields. In addition, the results from this aggregation
 * are used indirectly by the modification function. The search functionality reports back not only which fields
 * exist in the XML document but also which fields contain characters found in the $symbols and $first_chars arrays.
 * This information is generally published to the user, allowing them to select which fields to include in the
 * MySQL version of the XML file and which fields should get substituted versions.
 * Perhaps in a future version this class should be split into two.
 *
    * NOTE: In PHP4, you do not need to first decode cdata from utf8. PHP5 assumes the source is in utf8, so you must
    * first decode the cdata.
    *
    * @package JLex5.0
    * @author Jonathan Dick
    * @date Dec. 30, 2005
    */
class db_modifier
{
    /**
     * The XML parser used for the current parse.
     * @var resource|object
     */
    public $parser;

    /**
     * The file pointer for writing to disk.
     * @var resource
     */
    public $out;

    /**
     * The character data collected since the last end tag, already escaped for XML output
     * ('&', '<' and '>' are stored as entities).
     * @var string
     */
    public $cur_data;

    /**
     * The symbols to be substituted for. The symbols and their substitutes are kept in separate, parallel arrays
     * so that they can be handed directly to str_replace().
     * @var array of strings
     */
    public $symbols;

    /**
     * An array of the substitute symbols, parallel to $symbols.
     * @var array of strings
     */
    public $substitutes;

    /**
     * Substitution rules which only apply to the first character: an associative array indexed by the symbol
     * to be replaced and containing its substitute.
     * @var array of strings
     */
    public $first_chars;

    /**
     * A regular expression of the form "[symbols]|^[first chars]" composed from $symbols and $first_chars.
     *
     * It is kept up to date for callers that display or inspect it. The matching done by this class itself
     * compares against $symbols and $first_chars directly, so characters that are special inside a bracket
     * expression cannot distort the result.
     * @var string
     */
    public $pattern;

    /**
     * The set of symbols to be replaced in the alpha field.
     * @var array of strings
     */
    public $alpha_symbols;

    /**
     * The set of substitutes for the $alpha_symbols.
     * @var array of strings
     */
    public $alpha_substitutes;

    /**
     * The first-character rules for the alpha field (symbol => substitute).
     * @var array of strings
     */
    public $alpha_first_chars;

    /**
     * The regular expression representing $alpha_symbols and $alpha_first_chars, built like $pattern.
     * It is not consulted by the matching code in this class.
     * @var string
     */
    public $alpha_pattern;

    /**
     * The fields which will be used to create the alpha column.
     *
     * The first field will be used in the alpha column if it exists. If that field does not exist
     * then the second field will be used and so on. The rules specified by $alpha_symbols and
     * $alpha_first_chars are then applied to the value of the field selected.
     * @var array of strings
     */
    public $sort_fields;

    /**
     * True when sort fields are provided by the user.
     * @var boolean
     */
    public $has_sort_fields;

    /**
     * The current value of the alpha column.
     *
     * Because the document is parsed in a linear fashion, we don't know if a field exists
     * until we parse that field. Consequently, if any sort field is encountered we must save
     * the value and only discard it once we reach a sort field of higher precedence.
     * @var string
     */
    public $alpha;

    /**
     * The index within $sort_fields of the field whose content currently sits in $alpha.
     *
     * When a field contained in $sort_fields is parsed, its index is looked up and compared to this value.
     * If it is lower, the value of that field replaces the content currently stored in $alpha.
     * @var integer
     */
    public $cur_alpha_index;

    /**
     * The XML tag marking the beginning of an entry.
     * @var string
     */
    public $head_tag;

    /**
     * True when the function being performed by this parse is modification.
     * @var boolean
     */
    public $produce_stripped_version;

    /**
     * The set of fields in the XML document whose cDATA matches the rules in $symbols and $first_chars.
     *
     * This array is filled when parsing for field types, i.e. the first functionality discussed above.
     * @var array
     */
    public $strippable_fields;

    /**
     * Holds one of two field sets depending on the function being performed. During a field search it is the
     * growing list of fields found within entries. During modification it is the set of fields to be included
     * in the modified version of the XML document.
     * @var array
     */
    public $fields;

    /**
     * The set of fields for which modified ('_s') versions should be created based on the rules in
     * $symbols and $first_chars.
     * @var array
     */
    public $fields_to_strip;

    /**
     * Set to true once a start $head_tag is found and to false when an end $head_tag is parsed.
     * @var boolean
     */
    public $in_entry;

    /**
     * Initialised to false by the constructor; not read anywhere in this class.
     * @var boolean
     */
    public $check_all_fields;

    /**
     * Initialised to 0 by the constructor; not read anywhere in this class.
     * @var integer
     */
    public $count;

    /**
     * Puts every property into a defined, empty starting state.
     */
    public function __construct()
    {
        $this->check_all_fields = false;
        $this->count = 0;
        $this->cur_data = "";
        $this->symbols = array();
        $this->substitutes = array();
        // Must be an array: strip() and the matching code look keys up in it.
        $this->first_chars = array();
        $this->pattern = "";
        $this->alpha_symbols = array();
        $this->alpha_substitutes = array();
        $this->alpha_first_chars = array();
        $this->alpha_pattern = "";
        $this->sort_fields = array();
        $this->has_sort_fields = false;
        $this->alpha = "";
        // "No sort field seen yet": every real index must compare lower than this.
        $this->cur_alpha_index = PHP_INT_MAX;
        $this->head_tag = "";
        $this->produce_stripped_version = false;
        $this->strippable_fields = array();
        $this->fields = array();
        $this->fields_to_strip = array();
        $this->in_entry = false;
    }

    /**
     * Extracts field names from a whitespace-separated string and appends them to $sort_fields.
     *
     * Names are lower-cased because the parse handlers lower-case every element name before comparing.
     * Empty items produced by repeated separators are skipped.
     *
     * @param string $sort_fields A space separated list of fields contained in the XML document.
     * @return void The fields are appended to the $sort_fields property.
     */
    public function get_sort_fields($sort_fields)
    {
        $names = preg_split('/\s+/', (string) $sort_fields, -1, PREG_SPLIT_NO_EMPTY);
        foreach ($names as $name) {
            $this->sort_fields[] = strtolower($name);
        }
    }

    /**
     * Extracts fields, one per line, from a file and returns them in an array.
     *
     * This function is no longer used.
     *
     * @param string $fields The filename containing the fields to be extracted.
     * @return array The non-empty, trimmed lines of the file; an empty array if it cannot be opened.
     */
    public function get_fields_to_strip($fields)
    {
        $fields_to_strip = array();
        $in = fopen($fields, "r");
        if ($in === false) {
            return $fields_to_strip;
        }
        // Compare against false explicitly so that a final line "0" is not mistaken for end of file.
        while (($line = fgets($in)) !== false) {
            $field = trim($line);
            if ($field !== "") {
                $fields_to_strip[] = $field;
            }
        }
        fclose($in);
        return $fields_to_strip;
    }

    /**
     * Reads the substitution rules for the alpha field from a textarea and appends the symbols to
     * $alpha_symbols and their substitutes to $alpha_substitutes, then rebuilds $alpha_pattern.
     *
     * Each line should hold a rule in the form "X = Y". '' means deletion, e.g. "X = ''". Several symbols
     * may share one substitute, e.g. "WXY = Z". Lines without " = " are ignored.
     *
     * @param string $textarea A textarea from an HTML form (slashes are stripped, as with magic quotes).
     * @return void Only properties of this object are affected.
     */
    public function make_alpha_symbols_table($textarea)
    {
        foreach ($this->parse_rules(stripslashes($textarea)) as $rule) {
            $this->alpha_symbols[] = $rule[0];
            $this->alpha_substitutes[] = $rule[1];
        }
        $this->alpha_pattern = $this->compose_pattern($this->alpha_symbols, $this->alpha_first_chars);
    }

    /**
     * Reads the first-character rules for the alpha field from a textarea into $alpha_first_chars,
     * an associative array indexed by the symbol to be replaced, then rebuilds $alpha_pattern.
     *
     * The rule syntax is the same as for make_alpha_symbols_table(): "X = Y", "X = ''", "WXY = Z".
     *
     * @param string $textarea A textarea from an HTML form (slashes are stripped, as with magic quotes).
     * @return void Only properties of this object are affected.
     */
    public function make_alpha_first_chars_table($textarea)
    {
        foreach ($this->parse_rules(trim(stripslashes($textarea))) as $rule) {
            $this->alpha_first_chars[$rule[0]] = $rule[1];
        }
        $this->alpha_pattern = $this->compose_pattern($this->alpha_symbols, $this->alpha_first_chars);
    }

    /**
     * Reads the substitution rules for field values from a textarea and appends the symbols to $symbols
     * and their substitutes to $substitutes, then rebuilds $pattern.
     *
     * Each line should hold a rule in the form "X = Y". '' means deletion, e.g. "X = ''". Several symbols
     * may share one substitute, e.g. "WXY = Z". Lines without " = " are ignored.
     *
     * @param string $textarea A textarea from an HTML form (slashes are stripped, as with magic quotes).
     * @return void Only properties of this object are affected.
     */
    public function make_symbols_table($textarea)
    {
        foreach ($this->parse_rules(stripslashes($textarea)) as $rule) {
            $this->symbols[] = $rule[0];
            $this->substitutes[] = $rule[1];
        }
        $this->pattern = $this->compose_pattern($this->symbols, $this->first_chars);
    }

    /**
     * Reads the first-character rules for field values from a textarea into $first_chars, an associative
     * array indexed by the symbol to be replaced, then rebuilds $pattern.
     *
     * The rule syntax is the same as for make_symbols_table(): "X = Y", "X = ''", "WXY = Z".
     *
     * @param string $textarea A textarea from an HTML form (slashes are stripped, as with magic quotes).
     * @return void Only properties of this object are affected.
     */
    public function make_first_chars_table($textarea)
    {
        foreach ($this->parse_rules(trim(stripslashes($textarea))) as $rule) {
            $this->first_chars[$rule[0]] = $rule[1];
        }
        $this->pattern = $this->compose_pattern($this->symbols, $this->first_chars);
    }

    /**
     * Returns $value modified according to $alpha_symbols/$alpha_substitutes and $alpha_first_chars.
     *
     * @param string $value The content to be modified.
     * @return string The modified content.
     */
    public function alpha_strip($value)
    {
        return $this->apply_rules($value, $this->alpha_symbols, $this->alpha_substitutes, $this->alpha_first_chars);
    }

    /**
     * Returns $value modified according to $symbols/$substitutes and $first_chars.
     *
     * @param string $value The content to be modified.
     * @return string The modified content.
     */
    public function strip($value)
    {
        return $this->apply_rules($value, $this->symbols, $this->substitutes, $this->first_chars);
    }

    /**
     * Handles a start tag during XML parsing.
     *
     * The element name is lower-cased. If it is the head tag, the parse is marked as being inside an entry.
     * During a field search nothing else happens. During modification the start tag is written to the new
     * document (hyphens in the name replaced by underscores, attributes dropped) if the element lies
     * outside any entry, or lies inside an entry and is listed in $fields.
     *
     * @param mixed  $xp      The XML parser (unused).
     * @param string $element The element name as reported by the parser.
     * @param array  $attribs The element's attributes (unused; attributes are not copied).
     * @return void
     */
    public function startHandler($xp, $element, $attribs)
    {
        $element = strtolower($element);
        if ($this->is_head_tag($element)) {
            $this->in_entry = true;
        }
        if (!$this->produce_stripped_version) {
            return;
        }

        $tag = str_replace("-", "_", $element);
        if (!$this->in_entry) {
            fwrite($this->out, "<" . $tag . ">\n");
        } elseif ($this->is_listed($element, $tag, $this->fields)) {
            fwrite($this->out, "<" . $tag . ">");
        }
    }

    /**
     * Handles an end tag during XML parsing.
     *
     * During modification:
     *  - if the element is a sort field of higher precedence than the one currently held, its stripped
     *    content becomes the alpha value;
     *  - at the end of an entry the alpha value is written as an <alpha> element and reset;
     *  - the collected content and the closing tag are written if the element lies outside any entry or is
     *    listed in $fields;
     *  - if the element is listed in $fields_to_strip and its content matches the substitution rules, an
     *    additional '<name>_s' element holding the stripped content is written.
     *
     * During a field search: the element is added to $fields if it lies inside an entry and is not yet
     * listed, and to $strippable_fields if its content matches the substitution rules.
     *
     * @param mixed  $xp      The XML parser (unused).
     * @param string $element The element name as reported by the parser.
     * @return void
     */
    public function endHandler($xp, $element)
    {
        $element = strtolower($element);
        // Leading whitespace is dropped by cDataHandler(); trailing whitespace is dropped here.
        $data = rtrim((string) $this->cur_data);
        $closes_entry = $this->is_head_tag($element);

        if ($this->produce_stripped_version) {
            $tag = str_replace("-", "_", $element);

            if ($this->has_sort_fields) {
                $index = $this->sort_field_index($element, $tag);
                if ($index !== false && $index < $this->cur_alpha_index) {
                    $this->alpha = $this->alpha_strip($data);
                    $this->cur_alpha_index = $index;
                }
                if ($closes_entry) {
                    // Written just before the entry's closing tag so that it belongs to the entry.
                    fwrite($this->out, "<alpha>" . $this->alpha . "</alpha>\n");
                    $this->alpha = "";
                    $this->cur_alpha_index = count($this->sort_fields);
                }
            }

            if (!$this->in_entry || $this->is_listed($element, $tag, $this->fields)) {
                fwrite($this->out, $data . "</" . $tag . ">\n");
            }

            if ($this->is_listed($element, $tag, $this->fields_to_strip) && $this->content_matches_rules($data)) {
                fwrite($this->out, "<" . $tag . "_s>" . $this->strip($data) . "</" . $tag . "_s>\n");
            }
        } else {
            if ($this->in_entry && !in_array($element, $this->fields, true)) {
                $this->fields[] = $element;
            }
            if ($this->content_matches_rules($data) && !in_array($element, $this->strippable_fields, true)) {
                $this->strippable_fields[] = $element;
            }
        }

        if ($closes_entry) {
            $this->in_entry = false;
        }
        $this->cur_data = "";
    }

    /**
     * Handles character data during parsing by appending it, XML-escaped, to $cur_data.
     *
     * The parser may deliver one text node in several pieces (for instance around entities or line ends),
     * so pieces are not trimmed individually: only whitespace at the very start of the collected content
     * is dropped here, which also discards the indentation between tags.
     *
     * @param mixed  $xp   The XML parser (unused).
     * @param string $data A piece of character data, with entities already decoded by the parser.
     * @return void
     */
    public function cDataHandler($xp, $data)
    {
        if ((string) $this->cur_data === "") {
            $data = ltrim($data);
        }
        if ($data === "") {
            return;
        }
        $this->cur_data .= strtr($data, array("&" => "&amp;", "<" => "&lt;", ">" => "&gt;"));
    }

    /**
     * Removes html encodings of special characters: named entities from PHP's HTML_ENTITIES translation
     * table first, then decimal numeric references, each of which becomes the single byte chr(n mod 256).
     *
     * @param string $string The text to decode.
     * @return string The decoded text.
     */
    public function unhtmlentities($string)
    {
        $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
        $ret = strtr($string, $trans_tbl);
        return preg_replace_callback(
            '/&#([0-9]+);/',
            function ($match) {
                return chr(((int) $match[1]) % 256);
            },
            $ret
        );
    }

    /**
     * Parses an XML document and collects the set of all fields used within entries ($fields) and the set
     * of all fields whose content matches the substitution rules ($strippable_fields). Both are sorted.
     *
     * As noted above, this is the first function, which only searches and produces no new XML document.
     *
     * @param string $xml         The filename of the XML document to be parsed.
     * @param string $head_tag    The tag marking the beginning of an entry.
     * @param string $symbols     The textarea containing the rules for symbol substitution.
     * @param string $first_chars The textarea containing the rules for first character substitution.
     * @return void The results are stored in the $fields and $strippable_fields properties.
     */
    public function get_fields($xml, $head_tag, $symbols, $first_chars)
    {
        $this->produce_stripped_version = false;
        $this->head_tag = $head_tag;
        if (trim($symbols) != "") {
            $this->make_symbols_table($symbols);
        }
        if (trim($first_chars) != "") {
            $this->make_first_chars_table($first_chars);
        }

        $this->parse_file($xml);

        sort($this->strippable_fields);
        sort($this->fields);
    }

    /**
     * Produces a modified version of an XML document based on the conditions given in the parameters.
     *
     * First, $produce_stripped_version is set to true, which tells the parse handlers which function is
     * being performed. Second, all user conditions are entered into the relevant properties. Third, the
     * document is parsed and the modified XML document is written.
     *
     * @param string $xml               The name of the XML file to be modified.
     * @param string $new_xml_name      The name of the new, modified XML file.
     * @param string $head_tag          The name of the tag enclosing an entry.
     * @param string $sort_fields       A space separated list of fields used to create the alpha field.
     * @param string $alpha_symbols     The textarea containing the substitution rules for the alpha field.
     * @param string $alpha_first_chars The textarea containing the first-character rules for the alpha field.
     * @param string $symbols           The textarea containing the substitution rules for field values.
     * @param string $first_chars       The textarea containing the first-character rules for field values.
     * @param array  $fields            The fields to be included in the modified document.
     * @param array  $fields_to_strip   The fields on which the substitution rules will be applied.
     * @return void A warning is raised if either file cannot be opened or the XML is malformed.
     */
    public function modify($xml, $new_xml_name, $head_tag, $sort_fields, $alpha_symbols, $alpha_first_chars, $symbols, $first_chars, $fields, $fields_to_strip)
    {
        $this->produce_stripped_version = true;
        $this->fields = $fields;
        $this->fields_to_strip = $fields_to_strip;
        $this->head_tag = $head_tag;

        if (trim($sort_fields) != "") {
            $this->get_sort_fields($sort_fields);
            $this->has_sort_fields = true;
            $this->fields[] = "alpha";
        } else {
            $this->has_sort_fields = false;
        }
        // Any index inside $sort_fields must beat the starting value, however many fields there are.
        $this->cur_alpha_index = count($this->sort_fields);

        if (trim($symbols) != "") {
            $this->make_symbols_table($symbols);
        }
        if (trim($first_chars) != "") {
            $this->make_first_chars_table($first_chars);
        }
        if (trim($alpha_symbols) != "") {
            $this->make_alpha_symbols_table($alpha_symbols);
        }
        if (trim($alpha_first_chars) != "") {
            $this->make_alpha_first_chars_table($alpha_first_chars);
        }

        $this->out = fopen($new_xml_name, "w");
        if ($this->out === false) {
            trigger_error("db_modifier: cannot open '" . $new_xml_name . "' for writing", E_USER_WARNING);
            return;
        }
        fwrite($this->out, "\n");
        $this->parse_file($xml);
        fclose($this->out);
    }

    /**
     * Runs the XML parser over a file line by line with this object's handlers attached.
     *
     * @param string $xml The filename of the XML document.
     * @return boolean True if the whole document was parsed without error.
     */
    private function parse_file($xml)
    {
        $in = fopen($xml, "r");
        if ($in === false) {
            trigger_error("db_modifier: cannot open XML file '" . $xml . "'", E_USER_WARNING);
            return false;
        }

        // Start from a clean state even if an earlier parse was aborted half-way through an entry.
        $this->in_entry = false;
        $this->cur_data = "";

        $this->parser = xml_parser_create();
        // Array callables bind the handlers to this object without xml_set_object().
        xml_set_element_handler($this->parser, array($this, "startHandler"), array($this, "endHandler"));
        xml_set_character_data_handler($this->parser, array($this, "cDataHandler"));

        $ok = true;
        while (($line = fgets($in)) !== false) {
            if (!xml_parse($this->parser, $line, false)) {
                $ok = false;
                break;
            }
        }
        if ($ok) {
            // Tell the parser that the input is complete so that truncated documents are detected.
            $ok = (bool) xml_parse($this->parser, "", true);
        }
        if (!$ok) {
            trigger_error(
                sprintf(
                    "db_modifier: XML error in '%s' at line %d: %s",
                    $xml,
                    xml_get_current_line_number($this->parser),
                    xml_error_string(xml_get_error_code($this->parser))
                ),
                E_USER_WARNING
            );
        }

        fclose($in);
        // From PHP 8.0 on the parser is an object that is released automatically.
        if (PHP_VERSION_ID < 80000) {
            xml_parser_free($this->parser);
        }
        return $ok;
    }

    /**
     * Turns rule text into a list of (symbol, substitute) pairs, one pair per symbol.
     *
     * @param string $text Rule lines of the form "X = Y", separated by "\n".
     * @return array List of array(symbol, substitute).
     */
    private function parse_rules($text)
    {
        $rules = array();
        foreach (explode("\n", $text) as $line) {
            $parts = explode(" = ", $line);
            if (count($parts) < 2) {
                // Blank or malformed line: there is no substitute to read.
                continue;
            }
            $substitute = trim($parts[1]);
            if ($substitute == "''") {
                $substitute = "";
            }
            foreach ($this->split_characters($parts[0]) as $symbol) {
                $rules[] = array($symbol, $substitute);
            }
        }
        return $rules;
    }

    /**
     * Splits text into characters: UTF-8 characters when the text is valid UTF-8, single bytes otherwise.
     *
     * @param string $text The text to split.
     * @return array The non-empty pieces.
     */
    private function split_characters($text)
    {
        $chars = preg_split('//u', $text, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars === false) {
            // Not valid UTF-8 (e.g. Latin-1 input): fall back to one symbol per byte.
            $chars = preg_split('//', $text, -1, PREG_SPLIT_NO_EMPTY);
        }
        return $chars;
    }

    /**
     * Wraps a run of characters in square brackets; an empty run gives an empty string.
     *
     * A trailing apostrophe is moved to the front, as the original code did (presumably so that a hyphen
     * entered before it, as in "-'", ends up last and is read literally).
     *
     * @param string $chars The characters to put inside the brackets.
     * @return string The bracket expression, or "".
     */
    private function bracket_expression($chars)
    {
        if ($chars == "") {
            return "";
        }
        if (substr($chars, -1, 1) == "'") {
            $chars = "'" . substr($chars, 0, -1);
        }
        return "[" . $chars . "]";
    }

    /**
     * Builds the "[symbols]|^[first chars]" expression stored in $pattern / $alpha_pattern.
     *
     * @param array $symbols     The list of symbols.
     * @param array $first_chars The first-character table (symbol => substitute).
     * @return string The expression; "" when there are no symbols at all.
     */
    private function compose_pattern($symbols, $first_chars)
    {
        $pattern = $this->bracket_expression(implode("", $symbols));
        $anchored = $this->bracket_expression(trim(implode("", array_keys($first_chars))));
        if ($anchored != "") {
            if ($pattern != "") {
                $pattern .= "|";
            }
            $pattern .= "^" . $anchored;
        }
        return $pattern;
    }

    /**
     * Applies a symbol table everywhere in $value and a first-character table at its start.
     *
     * The first-character step repeats while the result still starts with a listed symbol. It stops as
     * soon as a result repeats, so rules that map a symbol to itself or form a cycle cannot loop forever.
     *
     * @param string $value       The content to be modified.
     * @param array  $symbols     Symbols to replace.
     * @param array  $substitutes Their substitutes, parallel to $symbols.
     * @param array  $first_chars First-character table (symbol => substitute).
     * @return string The modified content.
     */
    private function apply_rules($value, $symbols, $substitutes, $first_chars)
    {
        $stripped = str_replace($symbols, $substitutes, $value);
        if (!is_array($first_chars) || count($first_chars) == 0) {
            return $stripped;
        }

        $seen = array();
        while (!isset($seen[$stripped])) {
            $seen[$stripped] = true;
            $lead = $this->leading_symbol($stripped, $first_chars);
            if ($lead === null) {
                break;
            }
            $stripped = $first_chars[$lead] . substr($stripped, strlen($lead));
        }
        return $stripped;
    }

    /**
     * Finds the key of $table that $value starts with.
     *
     * Keys are compared as byte strings over their full length, so multi-byte characters work.
     *
     * @param string $value The text to inspect.
     * @param array  $table A table indexed by symbol.
     * @return string|null The matching symbol, or null if there is none.
     */
    private function leading_symbol($value, $table)
    {
        $value = (string) $value;
        foreach ($table as $symbol => $unused) {
            $symbol = (string) $symbol;
            if ($symbol !== "" && strncmp($value, $symbol, strlen($symbol)) === 0) {
                return $symbol;
            }
        }
        return null;
    }

    /**
     * Tells whether content contains any symbol from $symbols or starts with a symbol from $first_chars.
     *
     * @param string $data The (escaped, trimmed) content of a field.
     * @return boolean True if a substitution rule applies to the content.
     */
    private function content_matches_rules($data)
    {
        if ($data === "") {
            return false;
        }
        foreach ($this->symbols as $symbol) {
            $symbol = (string) $symbol;
            if ($symbol !== "" && strpos($data, $symbol) !== false) {
                return true;
            }
        }
        return is_array($this->first_chars) && $this->leading_symbol($data, $this->first_chars) !== null;
    }

    /**
     * Tells whether a lower-cased element name is the head tag (compared case-insensitively).
     *
     * @param string $element The lower-cased element name.
     * @return boolean
     */
    private function is_head_tag($element)
    {
        return $element === strtolower((string) $this->head_tag);
    }

    /**
     * Tells whether an element is in a field list under its own name or its underscore form.
     *
     * @param string $element The lower-cased element name.
     * @param string $tag     The same name with hyphens replaced by underscores.
     * @param mixed  $list    The field list to search.
     * @return boolean
     */
    private function is_listed($element, $tag, $list)
    {
        if (!is_array($list)) {
            return false;
        }
        return in_array($element, $list, true) || in_array($tag, $list, true);
    }

    /**
     * Looks an element up in $sort_fields under its own name, then under its underscore form.
     *
     * @param string $element The lower-cased element name.
     * @param string $tag     The same name with hyphens replaced by underscores.
     * @return integer|false The position in $sort_fields, or false if the element is not a sort field.
     */
    private function sort_field_index($element, $tag)
    {
        $index = array_search($element, $this->sort_fields, true);
        if ($index === false) {
            $index = array_search($tag, $this->sort_fields, true);
        }
        return $index;
    }
}

/*
//NOTE: you must remove utf8_encode from the make_symbol_table functions when testing from the command line
set_time_limit(0);
$converter = new db_modifier();
$sort_fields = "lxam lxoa";
$symbols = "á = a\né = e\ní = i\nó = o\nú = u\nÁ = A\nÉ = E\nÍ = I\nÓ = O\nÚ = U\nÑ = N\nñ = n\nü = u";
$first_chars = "-' = ''";
$alpha_symbols = "á = a";
$alpha_first_chars = "-' = ''";
$fields_not_to_strip = array();
$converter->get_fields("test.xml","refgroup",$symbols,$first_chars);
print_r($converter->fields);
$fields_to_strip = array_diff($converter->strippable_fields,$fields_not_to_strip);
$converter->modify("test.xml","with_stripped.xml","refgroup", $sort_fields,$alpha_symbols,$alpha_first_chars,$symbols,$first_chars, $converter->fields,$fields_to_strip);
*/
?>