Monday, October 5, 2015

[19.9] Compress XML (encoding XML)

1. Example


<family lastName="McDowell" state="CA">
    <person firstName="Gayle">Some Message</person>


</family>



Element --> Tag Attributes END(tag head end) Children END(tag tail end)
Attribute --> Tag Value
END --> 0
Tag --> some predefined mapping to int

Value --> string value END()

family -> l,  person ->2, frstName ->3, LastName -> 4j state -> 5).
<family lastName="McDowell" state="CA">
<person firstName="Gayle">Some Message</person>
</family>
Becomes:

1 4 McDowell 5 CA 0 2 3 Gayle 0 Some Message 0 0


// 1. Read tag name // Image

// 2. Read attributes
// 3. End of TagHead

// 4. End of tag , /> OR >

// 1. TagHead

// 2. Attribute

// 3. TagHead End

// 4. Tag Value

// 5. Tag's Children

// 6. TagTail End

<note>
<to>Tove</to>
<from>Jani</from>
<heading>Reminder</heading>
<body>Don't forget me this weekend!</body>
</note>

02
03Tove 01
04Jani 01
05Reminder 01
06Don't forget me this weekend! 01
01


<note date="12/11/2007">
<to>Tove</to>
<from>Jani</from>
</note>

http://chimera.labs.oreilly.com/books/1234000001805/ch24.html#the_butler_did_it
@XmlRootElement
public class Person {
    public String name;
    public Address address;
    public int age;
}
public class Address {
    public String city, street;
    public int number, zip;
}

<person>
    <name>Pat Niemeyer</name>
    <address>
        <city>St. Louis</city>
        <street>Java St.</street>
        <number>1234</number>
        <zip>54321</zip>
    </address>
    <age>42</age>
</person>
<!-- Simple -->
<Sentence>This is text.</Sentence>

<!-- Element -->
<Paragraph><Sentence>This is text.</Sentence></Paragraph>

<!-- Mixed -->
<Paragraph>
        <Sentence>This <verb>is</verb> text.</Sentence>
</Paragraph>

<!-- Empty -->
<PageBreak></PageBreak>

Attributes

<Document type="LEGAL"id="42">...</Document>
<Image name="truffle.jpg"/>



2. Implementation


Part1: Encoding



private Map tagMap; // tag gets mapped to a pre-defined integer value
private static final Byte[] END={0,1};
private List tokens;
private int currentTokenIndex;




byte[] encode(char[] input) throws IOException
{
     


     // STEP1:   Process Input      
     tokenize(input);
     currentTokenIndex = 0;





     // STEP2:   Encode processed input
     ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
     encodeTokens(outputStream);
   


}
void encodeTokens(ByteArrayOutputStream output)
{



     
     // 
     nextToken("<");



  
     // 1. Read tag name // Image
     String tagName = nextToken();
     output.write( getTagCode(tagName)  );




     // 2. Read attributes
     while ( !hasNextToken(">") &&  !hasNextTokens("/", ">")    )
     {



          // read next attribute
          String key = nextToken();   // name
          nextToken("=");             // =
          String value = nextToken(); // "truffle.jpg"



          
          output.write(  getTagCode(key)  );
          for(char c: vlaue.toCharArray() )
              output.write(c);
          // NOTE: after attribute value, it is the End
          output.write(END[0]);
          output.write(END[1]);
                


     }     




     // 3. End of TagHead
     output.write(END[0]);
     output.write(END[1]);    




     // 4. End of tag ,  /> OR >
     // CASE1: Attribute End like 
     if ( hasNextTokens("/", ">")) //  />
     {
          nextToken("/");
          nextToken(">");
     }
     // CASE2: Tag End like Gambardella, Matthew
     else 
     {




          nextToken(">");




          while(  !hasNextTokens("<","/"))
              encodeTokens(output); // Encode child
    


    
          // Encoding tag, 
nextToken("<");
          nextToken("/");
          nextToken(tagName);
          nextToken(">");




     }





     // NOTE: end of tag
     output.write(END[0]); 
     output.write(END[1]);





     

}

      Ralls, Kim
      Midnight Rain
      Fantasy
      5.95
      2000-12-16
      A former architect battles corporate zombies, 
      an evil sorceress, and her own childhood to become queen 
      of the world.
   


Part2: Compress further

You can treat file as a general stream of characters and use any number of comparison techniques: Shannon-Fano coding, Huffman coding or Arithmetic coding

//assuming a mapping of 
// family -> 1, 
// person ->2, 
// firstName -> 3
// lastName -> 4
// state -> 5
//                                       
// 
//    Some Message
// 
// Becomes:
//    1 4 McDowell 5 CA 0 2 3 Gayle 0 Some Message 0 0.




void encode(Element root, StringBuffer sb)
{

     // validate the input
     if (root == null)
          return;



     // 1. TagHead 
     encode(root.getTagName(), sb);




     // 2. Attribute
     for (Attribute a: root.attributes())
     {
          encode(a, sb);
     }



     // 3. TagHead End
     encode("0",sb);
     


     // 4. Tag Value
     if ( root.hasValue())
         encode(root.getVlaue(), sb);


   
     // 5. Tag's Children
     for (Element e: root.children())
     {
         encode(e, sb);
      }



     // 6. TagTail End
     encode("0", sb);



}



void encode(String s, StringBuffer sb)
{
     sb.append(s).append(" ");
}




void encode(Attribute a, StirngBuffer sb)
{
     encode(a.getTagName(), sb);
     encode(a.getValue(), sb);
}





class Element
{
     String getTagNmae() {return null;}
     Attribute[] attributes() {return null;}
     Element[] children() {return null;}
     String getValue() {return null;}
     boolena hasValue() {return false;}
}
class Attribute
{
     String getTagName() {return null;}
     String getValue() {return null;}
}
3. Similar Ones
https://github.com/wzhishen/cracking-the-coding-interview/blob/master/src/chap17/Q10.java

No comments:

Post a Comment