Changeset 675ed04
- Timestamp:
- 26/04/2012 17:08:28 (10 years ago)
- Branches:
- master
- Children:
- 466d447
- Parents:
- 6f64c7f
- git-author:
- Eric van der Vlist <vdv@dyomedea.com> (26/04/2012 17:08:28)
- git-committer:
- Eric van der Vlist <vdv@dyomedea.com> (26/04/2012 17:08:28)
- Location:
- archiver/pipelines/actions
- Files:
- 1 added
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
archiver/pipelines/actions/get-heritrix-warc.xpl
r51c2058 r675ed04 56 56 <p:processor name="oxf:pipeline"> 57 57 <p:input name="config" href="/data-access.xpl"/> 58 <p:input name="data" transform="oxf:xslt" href="aggregate('root', #data, #warc-dir-list )">58 <p:input name="data" transform="oxf:xslt" href="aggregate('root', #data, #warc-dir-list, #heritrix-job)"> 59 59 <config xsl:version="2.0"> 60 60 <relpath>queue.xml</relpath> … … 79 79 <xsl:value-of select="/root/html/body/a[ends-with(., '.warc')][1]/@href"/> 80 80 </parameter> 81 <parameter name="log-url" type="string"> 82 <xsl:value-of select="/root/job/configFiles/value[key='loggerModule.crawlLogPath'][1]/url"/> 83 </parameter> 81 84 </config> 82 85 </p:input> … … 87 90 for $q in /queue return 88 91 update 89 insert <action priority=$(priority) uuid="{util:uuid()}" type="package-heritrix-warc" url=$(url) directory=$(directory) heritrix-job-url=$(heritrix-job-url) warc-url=$(warc-url) />92 insert <action priority=$(priority) uuid="{util:uuid()}" type="package-heritrix-warc" url=$(url) directory=$(directory) heritrix-job-url=$(heritrix-job-url) warc-url=$(warc-url) log-url=$(log-url)/> 90 93 into $q, 91 94 -
archiver/pipelines/actions/package-heritrix-warc.xpl
rbe1a361 r675ed04 35 35 </p:processor> 36 36 37 <p:processor name="owk:from-warc-converter">38 <p:input name="data" href="#warc"/>39 <p:output name="data" id="warc-xml" debug="warc-xml"/>40 </p:processor>37 <p:processor name="owk:from-warc-converter"> 38 <p:input name="data" href="#warc"/> 39 <p:output name="data" id="warc-xml" debug="warc-xml"/> 40 </p:processor> 41 41 42 42 <p:processor name="oxf:null-serializer"> 43 43 <p:input name="data" href="#warc-xml"/> 44 44 </p:processor> 45 46 <!-- Store it in a temp file --> 45 46 <!-- Download the log --> 47 <p:processor name="oxf:url-generator"> 48 <p:input name="config" transform="oxf:xslt" href="#data"> 49 <config xsl:version="2.0"> 50 <url> 51 <xsl:value-of select="/action/@log-url"/> 52 </url> 53 <mode>text</mode> 54 <authentication> 55 <username> 56 <xsl:value-of select="doc('oxf:/config.xml')/config/heritrix/username"/> 57 </username> 58 <password> 59 <xsl:value-of select="doc('oxf:/config.xml')/config/heritrix/password"/> 60 </password> 61 <preemptive>false</preemptive> 62 </authentication> 63 </config> 64 </p:input> 65 <p:output name="data" id="log" debug="log"/> 66 </p:processor> 67 68 <p:processor name="oxf:xslt"> 69 <p:input name="data" href="#log"/> 70 <p:input name="config" href="parse-log.xslt"></p:input> 71 <p:output name="data" id="log-xml" debug="log-xml"/> 72 </p:processor> 73 74 <p:processor name="oxf:null-serializer"> 75 <p:input name="data" href="#log-xml"/> 76 </p:processor> 77 78 79 <!-- Store the WARC in a temp file --> 47 80 <p:processor name="oxf:file-serializer"> 48 81 <p:input name="config"> … … 84 117 <p:output name="data" id="zip"/> 85 118 </p:processor> 86 119 87 120 <p:processor name="oxf:file-serializer"> 88 121 <p:input name="config"> … … 92 125 </p:input> 93 126 <p:input name="data" href="#zip"/> 94 95 </p:processor> 96 97 <!-- <p:choose href="#heritrix-job">127 128 </p:processor> 129 130 <!-- <p:choose href="#heritrix-job"> 98 131 <p:when test="/job/crawlControllerState='FINISHED'"> 
99 132 <!-\- The job is finished, we can get its archive... -\->
Note: See TracChangeset for help on using the changeset viewer.