include_once("schema_loader.php5");
/**
* db_modifier produces a modified version of an xml document.
*
* Modifications include:
*
* - cDATA symbol substitution: substituting one type of symbol for another, e.g. 'a' for 'á'. The new content
* is placed within a new tag whose name carries a '_s' suffix, e.g. a field containing 'á' gains a sibling '<field>_s' tag containing 'a'.
* - Field removal: specified xml tags and associated content can be excluded from the modified xml document.
* - alphabetization tag: A new tag is created containing a value which will be used to sort query results.
* The value will be determined by a set of conditions supplied by the user.
*
*
* The parsing ability of this class actually provides two separate but related parses.
*
* - The first function produces a list of all the field types that exist within the xml document.
* - The second function does the actual modifications and produces a new XML document.
*
* I chose to use the same class to provide both functions because it seemed a waste to include an entire new
* class to provide the functionality of creating a list of fields. In addition the results from this aggregation
* are used by the modification function indirectly. The search functionality reports back not only which fields
* exist in the XML document but also which fields contain characters contained in the $symbols and $first_chars arrays.
* This information is generally published to the user, allowing him to select which fields he wants to include in the
* MySQL version of his XML file and which fields he may want to add substituted versions of.
* Perhaps in a future version this class should be split into two.
* NOTE: In PHP4, you do not need to first decode cdata from utf8. PHP5 assumes the source is in utf8, so you must
* first decode the cdata.
*
* @package JLex5.0
* @author Jonathan Dick
* @date Dec. 30, 2005
*/
class db_modifier {
    /**#@+
     * @access public
     */
    /**
     * The XML parser used by the current get_fields() / modify() run; null once the run is over.
     */
    public $parser;
    /**
     * The file pointer for writing to disk.
     * @var resource
     */
    public $out;
    /**
     * The character data collected (already XML-escaped) since the last end tag.
     * Surrounding whitespace is only trimmed when the end tag is handled.
     * @var string
     */
    public $cur_data;
    /**
     * The symbols to be substituted for. Symbols and substitutes live in two parallel arrays so they
     * can be handed straight to str_replace().
     * @var array of strings
     */
    public $symbols;
    /**
     * An array of the substitute symbols, parallel to $symbols.
     * @var array of strings
     */
    public $substitutes;
    /**
     * Substitution rules which only apply to the first character: symbol => substitute.
     * @var array of strings
     */
    public $first_chars;
    /**
     * A bracket expression composed of the values from the $symbols and $first_chars arrays,
     * e.g. "[abc]|^['-]".
     *
     * It is still built for backward compatibility, but the class itself no longer matches against
     * it: the check is done directly on $symbols / $first_chars (see has_substitutable_content()).
     * @var string
     */
    public $pattern;
    /**
     * The set of symbols to be replaced in the alpha field.
     * @var array of strings
     */
    public $alpha_symbols;
    /**
     * The set of substitutes for the $alpha_symbols.
     * @var array of strings
     */
    public $alpha_substitutes;
    /**
     * First-character substitution rules for the alpha field: symbol => substitute.
     * @var array of strings
     */
    public $alpha_first_chars;
    /**
     * The bracket expression representing $alpha_symbols and $alpha_first_chars (informational only).
     * @var string
     */
    public $alpha_pattern;
    /**
     * The fields which will be used to create the alpha column, in order of precedence.
     *
     * The first field is used if it exists in an entry, otherwise the second, and so on. The rules
     * in $alpha_symbols and $alpha_first_chars are applied to the value of the selected field.
     * @var array of strings
     */
    public $sort_fields;
    /**
     * True when sort fields were provided by the user.
     * @var boolean
     */
    public $has_sort_fields;
    /**
     * The current value of the alpha column.
     *
     * The document is parsed linearly, so the value of any sort field that is met is kept until a
     * sort field of higher precedence shows up in the same entry.
     * @var string
     */
    public $alpha;
    /**
     * The index within $sort_fields of the field whose content is currently held in $alpha.
     * A sort field only replaces $alpha when its own index is lower than this value.
     * @var integer
     */
    public $cur_alpha_index;
    /**
     * The XML tag marking the beginning of an entry.
     * @var string
     */
    public $head_tag;
    /**
     * True when the current parse is a modification run, false for a field search.
     * @var boolean
     */
    public $produce_stripped_version;
    /**
     * Filled by a field search: the fields whose content is affected by $symbols / $first_chars.
     * @var array
     */
    public $strippable_fields;
    /**
     * Field search: the growing list of fields found within entries.
     * Modification: the set of fields to be included in the modified document.
     * @var array
     */
    public $fields;
    /**
     * The fields for which an additional substituted ("_s") version should be written.
     * @var array
     */
    public $fields_to_strip;
    /**
     * True between a start $head_tag and the matching end $head_tag.
     * @var boolean
     */
    public $in_entry;
    /**
     * Set to false by the constructor; not used by this class (declared so it is not a dynamic property).
     */
    public $check_all_fields;
    /**
     * Set to 0 by the constructor; not used by this class (declared so it is not a dynamic property).
     */
    public $count;
    /**#@-*/

    /**
     * Puts every rule table and parse-state variable into its empty starting state.
     */
    public function __construct()
    {
        $this->check_all_fields = false;
        $this->count = 0;
        $this->cur_data = "";
        $this->symbols = array();
        $this->substitutes = array();
        // Was left uninitialised, which made strip() fail when no first-character rules were given.
        $this->first_chars = array();
        $this->pattern = "";
        $this->alpha_symbols = array();
        $this->alpha_substitutes = array();
        $this->alpha_first_chars = array();
        $this->alpha_pattern = "";
        $this->sort_fields = array();
        $this->has_sort_fields = false;
        $this->alpha = "";
        // "Nothing selected yet": any real index must compare lower (the old fixed 10 did not
        // guarantee that for long sort-field lists).
        $this->cur_alpha_index = PHP_INT_MAX;
        $this->strippable_fields = array();
        $this->fields = array();
        $this->fields_to_strip = array();
        $this->in_entry = false;
    }

    /**
     * Appends the names from a whitespace-separated list to $sort_fields.
     *
     * Names are lower-cased and hyphens become underscores, because that is the form
     * endHandler() compares them against.
     *
     * @param string $sort_fields A whitespace-separated list of fields contained in the XML document.
     * @return void The names are appended to $this->sort_fields.
     */
    public function get_sort_fields($sort_fields)
    {
        $names = preg_split('/\s+/', trim((string) $sort_fields), -1, PREG_SPLIT_NO_EMPTY);
        foreach ($names as $name) {
            $this->sort_fields[] = str_replace("-", "_", strtolower($name));
        }
    }

    /**
     * Reads fields, one per line, from a file and returns them in an array.
     *
     * This function is no longer used.
     *
     * @param string $fields The filename containing the fields to be extracted.
     * @return array The trimmed lines of the file; empty when the file is empty or cannot be opened.
     */
    public function get_fields_to_strip($fields)
    {
        $fields_to_strip = array();
        $in = fopen($fields, "r");
        if ($in === false) {
            // fopen() has already raised its warning.
            return $fields_to_strip;
        }
        // Compare against false explicitly so a line consisting of "0" does not end the loop.
        while (($line = fgets($in)) !== false) {
            $fields_to_strip[] = trim($line);
        }
        fclose($in);
        return $fields_to_strip;
    }

    /**
     * Reads the alpha-field substitution rules from a textarea into $alpha_symbols and
     * $alpha_substitutes and rebuilds $alpha_pattern from all alpha symbols.
     *
     * Each line holds one rule of the form "X = Y". "X = ''" deletes X, and "WXY = Z" maps each
     * of W, X and Y to Z. Lines without " = " are ignored.
     *
     * @param string $textarea A textarea from an HTML form.
     * @return void Only object properties are affected.
     */
    public function make_alpha_symbols_table($textarea)
    {
        foreach ($this->parse_rules(stripslashes($textarea)) as $rule) {
            $this->alpha_symbols[] = $rule[0];
            $this->alpha_substitutes[] = $rule[1];
        }
        $expression = $this->bracket_expression(implode("", $this->alpha_symbols));
        if ($expression !== "") {
            $this->alpha_pattern = $expression;
        }
    }

    /**
     * Reads the alpha-field first-character rules from a textarea into $alpha_first_chars
     * (symbol => substitute) and appends "^[...]" for them to $alpha_pattern.
     *
     * The rule syntax is the same as for make_alpha_symbols_table().
     *
     * @param string $textarea A textarea from an HTML form.
     * @return void Only object properties are affected.
     */
    public function make_alpha_first_chars_table($textarea)
    {
        foreach ($this->parse_rules(trim(stripslashes($textarea))) as $rule) {
            $this->alpha_first_chars[$rule[0]] = $rule[1];
        }
        $expression = $this->bracket_expression(trim(implode("", array_keys($this->alpha_first_chars))));
        if ($expression !== "") {
            if ($this->alpha_pattern != "") {
                $this->alpha_pattern .= "|";
            }
            $this->alpha_pattern .= "^" . $expression;
        }
    }

    /**
     * Reads the field substitution rules from a textarea into $symbols and $substitutes and
     * rebuilds $pattern from all symbols.
     *
     * Each line holds one rule of the form "X = Y". "X = ''" deletes X, and "WXY = Z" maps each
     * of W, X and Y to Z. Lines without " = " are ignored.
     *
     * @param string $textarea A textarea from an HTML form.
     * @return void Only object properties are affected.
     */
    public function make_symbols_table($textarea)
    {
        foreach ($this->parse_rules(stripslashes($textarea)) as $rule) {
            $this->symbols[] = $rule[0];
            $this->substitutes[] = $rule[1];
        }
        $expression = $this->bracket_expression(implode("", $this->symbols));
        if ($expression !== "") {
            $this->pattern = $expression;
        }
    }

    /**
     * Reads the first-character rules from a textarea into $first_chars (symbol => substitute)
     * and appends "^[...]" for them to $pattern.
     *
     * The rule syntax is the same as for make_symbols_table().
     *
     * @param string $textarea A textarea from an HTML form.
     * @return void Only object properties are affected.
     */
    public function make_first_chars_table($textarea)
    {
        if (!is_array($this->first_chars)) {
            $this->first_chars = array();
        }
        foreach ($this->parse_rules(trim(stripslashes($textarea))) as $rule) {
            $this->first_chars[$rule[0]] = $rule[1];
        }
        $expression = $this->bracket_expression(trim(implode("", array_keys($this->first_chars))));
        if ($expression !== "") {
            if ($this->pattern != "") {
                $this->pattern .= "|";
            }
            $this->pattern .= "^" . $expression;
        }
    }

    /**
     * Applies the alpha rules ($alpha_symbols, $alpha_substitutes, $alpha_first_chars) to $value.
     *
     * @param string $value The content to be modified.
     * @return string The modified content.
     */
    public function alpha_strip($value)
    {
        return $this->apply_rules($value, $this->alpha_symbols, $this->alpha_substitutes, $this->alpha_first_chars);
    }

    /**
     * Applies the field rules ($symbols, $substitutes, $first_chars) to $value.
     *
     * @param string $value The content to be modified.
     * @return string The modified content.
     */
    public function strip($value)
    {
        return $this->apply_rules($value, $this->symbols, $this->substitutes, $this->first_chars);
    }

    /**
     * Start-tag handler for the XML parser.
     *
     * Marks the beginning of an entry when the head tag is met. During a field search nothing
     * else happens. During a modification the start tag is written when the element is outside
     * of an entry, or inside one and listed in $fields. Written names always have hyphens
     * replaced by underscores; attributes are not copied.
     *
     * @param mixed  $xp      The parser invoking the handler (unused).
     * @param string $element The element name as reported by the parser.
     * @param array  $attribs The element's attributes (unused).
     * @return void
     */
    public function startHandler($xp, $element, $attribs)
    {
        $name = strtolower($element);
        if ($this->is_head_tag($name)) {
            $this->in_entry = true;
        }
        if (!$this->produce_stripped_version) {
            return;
        }
        $tag = str_replace("-", "_", $name);
        if ($this->in_entry) {
            if ($this->in_field_list($name, $this->fields)) {
                fwrite($this->out, "<" . $tag . ">");
            }
        } else {
            fwrite($this->out, "<" . $tag . ">\n");
        }
    }

    /**
     * End-tag handler for the XML parser.
     *
     * Modification run:
     *  - if the element is a sort field of higher precedence than the one currently held,
     *    its alpha-stripped content becomes $alpha;
     *  - at the end of an entry (head tag) the alpha element is written and the alpha state is
     *    reset so that the first sort field of the next entry is accepted;
     *  - the collected content and the closing tag are written when the element is outside of
     *    an entry, or inside one and listed in $fields;
     *  - if the content is affected by the substitution rules and the element is listed in
     *    $fields_to_strip, an extra "<name>_s" element with the substituted content is written.
     *
     * Field search: the element is added to $fields when inside an entry, and to
     * $strippable_fields when its content is affected by the substitution rules.
     *
     * @param mixed  $xp      The parser invoking the handler (unused).
     * @param string $element The element name as reported by the parser.
     * @return void
     */
    public function endHandler($xp, $element)
    {
        $name = strtolower($element);              // as found in the document, hyphens intact
        $tag = str_replace("-", "_", $name);       // as written to the output
        $is_head = $this->is_head_tag($name);
        // Whitespace is trimmed once per element, not per chunk, so inner spacing survives.
        $data = trim((string) $this->cur_data);

        if ($this->produce_stripped_version) {
            if ($this->has_sort_fields) {
                $index = array_search($tag, $this->sort_fields, true);
                if ($index !== false && $index < $this->cur_alpha_index) {
                    $this->alpha = $this->alpha_strip($data);
                    $this->cur_alpha_index = $index;
                }
                if ($is_head) {
                    fwrite($this->out, "<alpha>" . $this->alpha . "</alpha>\n");
                    $this->alpha = "";
                    $this->cur_alpha_index = count($this->sort_fields);
                }
            }
            if (!$this->in_entry || $this->in_field_list($name, $this->fields)) {
                fwrite($this->out, $data . "</" . $tag . ">\n");
            }
        } elseif ($this->in_entry && !in_array($name, $this->fields, true)) {
            $this->fields[] = $name;
        }

        if ($this->has_substitutable_content($data)) {
            if ($this->produce_stripped_version) {
                if ($this->in_field_list($name, $this->fields_to_strip)) {
                    $stripped_value = $this->strip($data);
                    fwrite($this->out, "<" . $tag . "_s>" . $stripped_value . "</" . $tag . "_s>\n");
                }
            } elseif (!in_array($name, $this->strippable_fields, true)) {
                $this->strippable_fields[] = $name;
            }
        }

        if ($is_head) {
            $this->in_entry = false;
        }
        $this->cur_data = "";
    }

    /**
     * Character-data handler: XML-escapes the chunk and appends it to $cur_data.
     *
     * The parser hands over text with entities already resolved, so "&", "<" and ">" have to be
     * escaped again before the text is written back out. Chunks are appended untrimmed because
     * the parser may split one text node into several calls; endHandler() trims the result.
     *
     * @param mixed  $xp   The parser invoking the handler (unused).
     * @param string $data One chunk of character data.
     * @return void
     */
    public function cDataHandler($xp, $data)
    {
        if ($data === "") {
            return;
        }
        // "&" must be replaced first, otherwise the entities produced for "<" and ">" get re-escaped.
        $this->cur_data .= str_replace(
            array("&", "<", ">"),
            array("&amp;", "&lt;", "&gt;"),
            $data
        );
    }

    /**
     * Replaces HTML entities in $string with the characters they stand for.
     *
     * Named entities are translated through the flipped HTML_ENTITIES table; decimal numeric
     * entities ("&#65;") are replaced with chr() of their number.
     *
     * @param string $string Text containing HTML entities.
     * @return string The decoded text.
     */
    public function unhtmlentities($string)
    {
        $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
        $ret = strtr($string, $trans_tbl);
        // A callback replaces the former /e (eval) modifier, which no longer exists in PHP 7+.
        return preg_replace_callback('/&#([0-9]+);/m', array($this, 'numeric_entity_to_char'), $ret);
    }

    /**
     * Field search: parses an XML document and collects the fields used within entries ($fields)
     * and the fields whose content is affected by the substitution rules ($strippable_fields).
     * No new XML document is produced.
     *
     * @param string $xml         The filename of the XML document to be parsed.
     * @param string $head_tag    The tag marking the beginning of an entry.
     * @param string $symbols     The textarea containing the rules for symbol substitution.
     * @param string $first_chars The textarea containing the rules for first character substitution.
     * @return void Results are left, sorted, in $this->fields and $this->strippable_fields.
     */
    public function get_fields($xml, $head_tag, $symbols, $first_chars)
    {
        $this->produce_stripped_version = false;
        // Element names are lower-cased by the handlers, so the head tag has to be as well.
        $this->head_tag = strtolower((string) $head_tag);
        $this->in_entry = false;
        $this->cur_data = "";
        if (trim((string) $symbols) != "") {
            $this->make_symbols_table($symbols);
        }
        if (trim((string) $first_chars) != "") {
            $this->make_first_chars_table($first_chars);
        }
        $this->parser = $this->create_parser();
        $this->parse_file($xml);
        $this->release_parser();
        sort($this->strippable_fields);
        sort($this->fields);
    }

    /**
     * Modification: writes a modified copy of an XML document according to the given rules.
     *
     * The user conditions are loaded into the object first, then the document is parsed and the
     * handlers write the new document as they go.
     *
     * @param string $xml               The name of the XML file to be modified.
     * @param string $new_xml_name      The name of the new, modified XML file.
     * @param string $head_tag          The name of the tag enclosing an entry.
     * @param string $sort_fields       A whitespace-separated list of fields used to create the alpha field.
     * @param string $alpha_symbols     The textarea containing the substitution rules for the alpha field.
     * @param string $alpha_first_chars The textarea containing the first-character rules for the alpha field.
     * @param string $symbols           The textarea containing the substitution rules for field values.
     * @param string $first_chars       The textarea containing the first-character rules for field values.
     * @param array  $fields            The fields to be included in the modified document.
     * @param array  $fields_to_strip   The fields for which a substituted "_s" version is written.
     * @return void
     */
    public function modify($xml, $new_xml_name, $head_tag, $sort_fields,
                           $alpha_symbols, $alpha_first_chars,
                           $symbols, $first_chars, $fields,
                           $fields_to_strip)
    {
        $this->produce_stripped_version = true;
        $this->fields = $fields;
        $this->fields_to_strip = $fields_to_strip;
        // Element names are lower-cased by the handlers, so the head tag has to be as well.
        $this->head_tag = strtolower((string) $head_tag);
        $this->in_entry = false;
        $this->cur_data = "";
        $this->alpha = "";

        if (trim((string) $sort_fields) !== "") {
            $this->get_sort_fields($sort_fields);
            $this->has_sort_fields = true;
            $this->fields[] = "alpha";
        } else {
            $this->has_sort_fields = false;
        }
        // Any real sort-field index is lower than this, so the first one met is always accepted.
        $this->cur_alpha_index = count($this->sort_fields);

        if (trim((string) $symbols) != "") {
            $this->make_symbols_table($symbols);
        }
        if (trim((string) $first_chars) != "") {
            $this->make_first_chars_table($first_chars);
        }
        if (trim((string) $alpha_symbols) != "") {
            $this->make_alpha_symbols_table($alpha_symbols);
        }
        if (trim((string) $alpha_first_chars) != "") {
            $this->make_alpha_first_chars_table($alpha_first_chars);
        }

        $this->out = fopen($new_xml_name, "w");
        if ($this->out === false) {
            // fopen() has already raised its warning; there is nothing to write to.
            return;
        }
        // NOTE(review): only a newline is written here; presumably this once was an XML
        // declaration - confirm against the consumers of the generated file.
        fwrite($this->out, "\n");

        $this->parser = $this->create_parser();
        $this->parse_file($xml);
        fclose($this->out);
        $this->release_parser();
    }

    /**
     * Creates an XML parser whose element and character-data callbacks are this object's handlers.
     *
     * @return mixed The new parser.
     */
    private function create_parser()
    {
        $parser = xml_parser_create();
        // Callable arrays bind the handlers to this object without xml_set_object().
        xml_set_element_handler($parser, array($this, "startHandler"), array($this, "endHandler"));
        xml_set_character_data_handler($parser, array($this, "cDataHandler"));
        return $parser;
    }

    /**
     * Releases the current parser and clears $parser.
     */
    private function release_parser()
    {
        // Only resource-based parsers need an explicit free; parser objects go away with the reference.
        if (is_resource($this->parser)) {
            xml_parser_free($this->parser);
        }
        $this->parser = null;
    }

    /**
     * Feeds a file line by line to $this->parser and finalises the parse.
     *
     * @param string $xml The filename of the XML document.
     * @return boolean True when the whole document was parsed without error.
     */
    private function parse_file($xml)
    {
        $in = fopen($xml, "r");
        if ($in === false) {
            // fopen() has already raised its warning.
            return false;
        }
        $ok = true;
        // Compare against false explicitly so a last line of "0" is not mistaken for end of file.
        while ($ok && ($line = fgets($in)) !== false) {
            $ok = (xml_parse($this->parser, $line, false) === 1);
        }
        fclose($in);
        if ($ok) {
            // Tell the parser the document is complete so pending data and errors surface.
            $ok = (xml_parse($this->parser, "", true) === 1);
        }
        if (!$ok) {
            trigger_error(
                sprintf(
                    "XML error in %s at line %d: %s",
                    $xml,
                    xml_get_current_line_number($this->parser),
                    xml_error_string(xml_get_error_code($this->parser))
                ),
                E_USER_WARNING
            );
        }
        return $ok;
    }

    /**
     * Turns the text of a rules textarea into a list of array(symbol, substitute) pairs.
     *
     * @param string $textarea One rule per line, in the form "X = Y".
     * @return array A list of two-element arrays: the symbol and its substitute.
     */
    private function parse_rules($textarea)
    {
        $rules = array();
        foreach (explode("\n", (string) $textarea) as $line) {
            $vals = explode(" = ", $line);
            if (count($vals) < 2) {
                // Blank or malformed line: there is no substitute to read.
                continue;
            }
            $substitute = trim($vals[1]);
            if ($substitute == "''") {
                $substitute = "";
            }
            foreach ($this->split_characters($vals[0]) as $symbol) {
                $rules[] = array($symbol, $substitute);
            }
        }
        return $rules;
    }

    /**
     * Splits a string into characters: UTF-8 characters when the string is valid UTF-8,
     * single bytes otherwise.
     *
     * @param string $text The text to split.
     * @return array The characters of $text.
     */
    private function split_characters($text)
    {
        $chars = preg_split('//u', $text, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars === false) {
            // Not valid UTF-8 (e.g. a single-byte encoding): fall back to bytes.
            $chars = preg_split('//', $text, -1, PREG_SPLIT_NO_EMPTY);
        }
        return $chars;
    }

    /**
     * Wraps a run of characters in "[...]"; returns "" for an empty run.
     *
     * @param string $chars The characters to put between the brackets.
     * @return string The bracket expression.
     */
    private function bracket_expression($chars)
    {
        if ($chars === "") {
            return "";
        }
        // A trailing apostrophe is moved to the front, as the original code did
        // (for the usual "-'" rule this leaves the hyphen last inside the brackets).
        if (substr($chars, -1, 1) == "'") {
            $chars = "'" . substr($chars, 0, -1);
        }
        return "[" . $chars . "]";
    }

    /**
     * Applies symbol substitution everywhere, then first-character substitution repeatedly
     * for as long as the value starts with a first-character symbol.
     *
     * @param string $value       The content to be modified.
     * @param array  $symbols     The symbols to replace.
     * @param array  $substitutes The substitutes, parallel to $symbols.
     * @param array  $first_chars First-character rules, symbol => substitute.
     * @return string The modified content.
     */
    private function apply_rules($value, $symbols, $substitutes, $first_chars)
    {
        $stripped = str_replace($symbols, $substitutes, (string) $value);
        while (($symbol = $this->leading_symbol($stripped, $first_chars)) !== null) {
            $stripped = $first_chars[$symbol] . substr($stripped, strlen($symbol));
        }
        return $stripped;
    }

    /**
     * Finds the first-character symbol that $value starts with.
     *
     * @param string $value       The text to inspect.
     * @param mixed  $first_chars First-character rules, symbol => substitute (anything else counts as empty).
     * @return string|null The matching symbol, or null when there is none.
     */
    private function leading_symbol($value, $first_chars)
    {
        if (!is_array($first_chars)) {
            return null;
        }
        foreach ($first_chars as $symbol => $substitute) {
            $symbol = (string) $symbol;   // digit symbols are stored as integer keys
            if ($symbol !== "" && strncmp($value, $symbol, strlen($symbol)) === 0) {
                return $symbol;
            }
        }
        return null;
    }

    /**
     * Tells whether $data contains any of $symbols or starts with any $first_chars symbol.
     *
     * @param string $data The trimmed content of a field.
     * @return boolean
     */
    private function has_substitutable_content($data)
    {
        if ($data === "") {
            return false;
        }
        if (is_array($this->symbols)) {
            foreach ($this->symbols as $symbol) {
                $symbol = (string) $symbol;
                if ($symbol !== "" && strpos($data, $symbol) !== false) {
                    return true;
                }
            }
        }
        return $this->leading_symbol($data, $this->first_chars) !== null;
    }

    /**
     * Tells whether a lower-cased element name is the head tag, in its hyphenated or underscore form.
     *
     * @param string $name The lower-cased element name.
     * @return boolean
     */
    private function is_head_tag($name)
    {
        $head = strtolower((string) $this->head_tag);
        return $head !== "" && ($name === $head || str_replace("-", "_", $name) === $head);
    }

    /**
     * Tells whether a lower-cased element name is listed in $list, either as found in the
     * document or with its hyphens replaced by underscores.
     *
     * @param string $name The lower-cased element name.
     * @param mixed  $list The list of field names (anything but an array counts as empty).
     * @return boolean
     */
    private function in_field_list($name, $list)
    {
        if (!is_array($list)) {
            return false;
        }
        return in_array($name, $list, true)
            || in_array(str_replace("-", "_", $name), $list, true);
    }

    /**
     * preg_replace_callback() helper for unhtmlentities(): converts one decimal entity match.
     *
     * @param array $match The regex match; index 1 holds the decimal digits.
     * @return string The result of chr() for that number.
     */
    private function numeric_entity_to_char($match)
    {
        return chr((int) $match[1]);
    }
}
/*
//NOTE: you must remove utf8_encode from the make_symbol_table functions when testing from the command line
set_time_limit(0);
$converter = new db_modifier();
$sort_fields = "lxam lxoa";
$symbols = "á = a\né = e\ní = i\nó = o\nú = u\nÁ = A\nÉ = E\nÍ = I\nÓ = O\nÚ = U\nÑ = N\nñ = n\nü = u";
$first_chars = "-' = ''";
$alpha_symbols = "á = a";
$alpha_first_chars = "-' = ''";
$fields_not_to_strip = array();
$converter->get_fields("test.xml","refgroup",$symbols,$first_chars);
print_r($converter->fields);
$fields_to_strip = array_diff($converter->strippable_fields,$fields_not_to_strip);
$converter->modify("test.xml","with_stripped.xml","refgroup",
$sort_fields,$alpha_symbols,$alpha_first_chars,$symbols,$first_chars,
$converter->fields,$fields_to_strip);
*/
?>